!--------------------------------------------------------------------------------------------------!
! Copyright (C) by the DBCSR developers group - All rights reserved                                !
! This file is part of the DBCSR library.                                                          !
!                                                                                                  !
! For information on the license, see the LICENSE file.                                            !
! For further information please visit https://dbcsr.cp2k.org                                      !
! SPDX-License-Identifier: GPL-2.0+                                                                !
!--------------------------------------------------------------------------------------------------!

MODULE dbcsr_transformations
   !! DBCSR transformations
   USE dbcsr_array_types, ONLY: array_data, &
                                array_hold, &
                                array_i1d_obj, &
                                array_release
   USE dbcsr_block_access, ONLY: dbcsr_get_block_p, &
                                 dbcsr_put_block
   USE dbcsr_block_operations, ONLY: dbcsr_block_conjg, &
                                     dbcsr_block_partial_copy, &
                                     dbcsr_block_real_neg, &
                                     dbcsr_block_transpose, &
                                     dbcsr_data_clear, &
                                     dbcsr_data_copy, &
                                     dbcsr_data_set
   USE dbcsr_data_methods, ONLY: &
      dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_get_memory_type, &
      dbcsr_data_get_size, dbcsr_data_get_size_referenced, dbcsr_data_get_type, dbcsr_data_hold, &
      dbcsr_data_init, dbcsr_data_new, dbcsr_data_release, dbcsr_data_set_pointer, &
      dbcsr_data_set_size_referenced, dbcsr_get_data, dbcsr_type_1d_to_2d
   USE dbcsr_data_operations, ONLY: dbcsr_copy_sort_data, &
                                    dbcsr_switch_data_area
   USE dbcsr_dist_methods, ONLY: &
      dbcsr_distribution_col_dist, dbcsr_distribution_hold, dbcsr_distribution_local_cols, &
      dbcsr_distribution_local_rows, dbcsr_distribution_mp, dbcsr_distribution_ncols, &
      dbcsr_distribution_nlocal_cols, dbcsr_distribution_nlocal_rows, dbcsr_distribution_nrows, &
      dbcsr_distribution_release, dbcsr_distribution_row_dist
   USE dbcsr_dist_operations, ONLY: dbcsr_get_local_cols, &
                                    dbcsr_get_local_rows, &
                                    dbcsr_get_stored_coordinates, &
                                    dbcsr_reblocking_targets, &
                                    dbcsr_transpose_dims, &
                                    dbcsr_transpose_distribution
   USE dbcsr_dist_util, ONLY: convert_sizes_to_offsets, &
                              dbcsr_checksum, &
                              dbcsr_pack_meta, &
                              dbcsr_unpack_meta, &
                              dbcsr_verify_matrix, &
                              get_internal_offsets, &
                              global_offsets_to_local, &
                              nfull_elements
   USE dbcsr_index_operations, ONLY: dbcsr_addto_index_array, &
                                     dbcsr_clearfrom_index_array, &
                                     dbcsr_repoint_index, &
                                     make_dense_index, &
                                     make_undense_index, &
                                     transpose_index_local
   USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, &
                                        dbcsr_iterator_next_block, &
                                        dbcsr_iterator_start, &
                                        dbcsr_iterator_stop
   USE dbcsr_kinds, ONLY: dp, &
                          int_8, &
                          sp
   USE dbcsr_methods, ONLY: &
      dbcsr_col_block_sizes, dbcsr_distribution, dbcsr_get_data_type, dbcsr_get_matrix_type, &
      dbcsr_has_symmetry, dbcsr_nblkcols_total, dbcsr_nblkrows_total, dbcsr_nfullcols_total, &
      dbcsr_nfullrows_total, dbcsr_release, dbcsr_row_block_sizes, dbcsr_valid_index
   USE dbcsr_mp_methods, ONLY: &
      dbcsr_mp_grid_remove, dbcsr_mp_grid_setup, dbcsr_mp_group, dbcsr_mp_has_subgroups, &
      dbcsr_mp_my_col_group, dbcsr_mp_my_row_group, dbcsr_mp_mynode, dbcsr_mp_mypcol, &
      dbcsr_mp_myprow, dbcsr_mp_npcols, dbcsr_mp_nprows, dbcsr_mp_numnodes, dbcsr_mp_pgrid
   USE dbcsr_mp_operations, ONLY: dbcsr_allgatherv, &
                                  hybrid_alltoall_any, &
                                  hybrid_alltoall_c1, &
                                  hybrid_alltoall_d1, &
                                  hybrid_alltoall_i1, &
                                  hybrid_alltoall_s1, &
                                  hybrid_alltoall_z1
   USE dbcsr_mpiwrap, ONLY: mp_allgather, &
                            mp_alltoall, mp_comm_type
   USE dbcsr_operations, ONLY: dbcsr_set
   USE dbcsr_ptr_util, ONLY: pointer_view
   USE dbcsr_types, ONLY: &
      dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_iterator, dbcsr_meta_size, dbcsr_mp_obj, &
      dbcsr_repl_col, dbcsr_repl_full, dbcsr_repl_none, dbcsr_repl_row, dbcsr_slot_blk_p, &
      dbcsr_slot_col_i, dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_nblks, &
      dbcsr_slot_nze, dbcsr_slot_row_p, dbcsr_type, dbcsr_type_complex_4, dbcsr_type_complex_8, &
      dbcsr_type_no_symmetry, dbcsr_type_real_4, dbcsr_type_real_8
   USE dbcsr_work_operations, ONLY: dbcsr_create, &
                                    dbcsr_finalize, &
                                    dbcsr_work_create
#include "base/dbcsr_base_uses.f90"

   IMPLICIT NONE
   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_transformations'

   LOGICAL, PARAMETER, PRIVATE          :: careful_mod = .FALSE.

   PUBLIC :: dbcsr_desymmetrize_deep, &
             dbcsr_new_transposed, &
             dbcsr_transposed, &
             dbcsr_complete_redistribute, &
             dbcsr_redistribute, dbcsr_make_untransposed_blocks
   PUBLIC :: dbcsr_replicate_all, dbcsr_distribute, dbcsr_datablock_redistribute
   PUBLIC :: dbcsr_make_dense, dbcsr_make_undense, dbcsr_make_dense_low

CONTAINS

   SUBROUTINE dbcsr_new_transposed(transposed, normal, shallow_data_copy, &
                                   transpose_data, transpose_distribution, transpose_index, &
                                   use_distribution, redistribute)
      !! Transposes a DBCSR matrix.
      !!
      !! Distribution options
      !! By default the distribution is transposed. If transpose_distribution
      !! is false, then an undetermined distribution is created that is
      !! compatible with the same process grid.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: transposed
         !! transposed DBCSR matrix
      TYPE(dbcsr_type), INTENT(IN)                       :: normal
         !! input DBCSR matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: shallow_data_copy, transpose_data, &
                                                            transpose_distribution, transpose_index
         !! only shallow data_copy; default is no; if set, the transpose_data option is ignored
         !! transpose data blocks, default is True
         !! transpose the distribution from the input matrix, default is True
         !! transpose the index (default=yes) or turn it into BCSC
      TYPE(dbcsr_distribution_obj), INTENT(IN), OPTIONAL :: use_distribution
         !! use this distribution
      LOGICAL, INTENT(IN), OPTIONAL                      :: redistribute
         !! redistributes the matrix; default is .TRUE. unless shallow or transpose_distribution are set.

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_new_transposed'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, stat
      ! blk_p holds the (signed) data offsets of the transposed blocks before
      ! they are either stored directly (shallow) or used to sort/copy the data.
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: blk_p
      LOGICAL                                            :: redist, shallow, tr_blocks, tr_dist, &
                                                            tr_index
      TYPE(dbcsr_distribution_obj)                       :: new_dist
      TYPE(dbcsr_type)                                   :: t2

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      IF (.NOT. dbcsr_valid_index(normal)) &
         DBCSR_ABORT("Matrix does not exist.")
      ! Internalize options: resolve each OPTIONAL argument into a plain
      ! local with its documented default.
      shallow = .FALSE.
      IF (PRESENT(shallow_data_copy)) shallow = shallow_data_copy
      tr_blocks = .TRUE.
      IF (PRESENT(transpose_data)) tr_blocks = transpose_data
      tr_dist = .TRUE.
      IF (PRESENT(transpose_distribution)) tr_dist = transpose_distribution
      tr_index = .TRUE.
      IF (PRESENT(transpose_index)) tr_index = transpose_index
      ! Prepare the distribution for the transposed matrix.  A caller-supplied
      ! distribution must have the dimensions of the transpose of the input.
      IF (PRESENT(use_distribution)) THEN
         IF (dbcsr_distribution_nrows(use_distribution) /= dbcsr_distribution_ncols(normal%dist)) &
            DBCSR_ABORT("Given distribution must be compatible with the current distribution")
         IF (dbcsr_distribution_ncols(use_distribution) /= dbcsr_distribution_nrows(normal%dist)) &
            DBCSR_ABORT("Given distribution must be compatible with the current distribution")
         new_dist = use_distribution
         CALL dbcsr_distribution_hold(new_dist)
      ELSE
         IF (tr_dist) THEN
            CALL dbcsr_transpose_distribution(new_dist, normal%dist)
         ELSE
            CALL dbcsr_transpose_dims(new_dist, normal%dist)
         END IF
      END IF
      ! Create the transposed matrix: row/column block sizes and offsets of
      ! the input are swapped.
      CALL dbcsr_create(transposed, name="transposed "//TRIM(normal%name), &
                        template=normal, &
                        dist=new_dist, &
                        row_blk_size_obj=normal%col_blk_size, &
                        col_blk_size_obj=normal%row_blk_size, &
                        matrix_type=dbcsr_get_matrix_type(normal), &
                        max_rbs=normal%max_cbs, &
                        max_cbs=normal%max_rbs, &
                        row_blk_offset=normal%col_blk_offset, &
                        col_blk_offset=normal%row_blk_offset)
      CALL dbcsr_distribution_release(new_dist)
      ! Reserve the space for the new indices.  When the index is transposed
      ! the new row pointer is sized for the transposed row count; otherwise
      ! the original row count is kept (BCSC-like layout).
      IF (tr_index) THEN
         CALL dbcsr_addto_index_array(transposed, dbcsr_slot_row_p, &
                                      reservation=transposed%nblkrows_total + 1, extra=transposed%nblks*2)
      ELSE
         CALL dbcsr_addto_index_array(transposed, dbcsr_slot_row_p, &
                                      reservation=normal%nblkrows_total + 1, extra=transposed%nblks*2)
      END IF
      CALL dbcsr_addto_index_array(transposed, dbcsr_slot_col_i, &
                                   reservation=normal%nblks)
      CALL dbcsr_addto_index_array(transposed, dbcsr_slot_blk_p, &
                                   reservation=normal%nblks)
      CALL dbcsr_repoint_index(transposed)
      ! Deep copies need their own data area sized like the source's
      ! referenced data; shallow copies reuse the source area below.
      IF (.NOT. shallow) THEN
         CALL dbcsr_data_ensure_size(transposed%data_area, &
                                     dbcsr_data_get_size_referenced(normal%data_area), &
                                     nocopy=.TRUE.)
      END IF
      !
      transposed%nblks = normal%nblks
      transposed%nze = normal%nze
      transposed%index(dbcsr_slot_nblks) = normal%nblks
      transposed%index(dbcsr_slot_nze) = normal%nze
      ! Transpose the local index.
      ALLOCATE (blk_p(normal%nblks), stat=stat)
      IF (stat /= 0) DBCSR_ABORT("blk_p")
      IF (tr_index) THEN
         ! Build the transposed CSR index; blk_p receives the permuted data
         ! offsets of the original blocks.
         CALL transpose_index_local(transposed%row_p, transposed%col_i, &
                                    normal%row_p, normal%col_i, blk_p, normal%blk_p)
         IF (dbg) THEN
            WRITE (*, *) 'orig. row_p', normal%row_p
            WRITE (*, *) 'orig. col_i', normal%col_i
            WRITE (*, *) 'orig. blk_p', normal%blk_p
            WRITE (*, *) 'new . row_p', transposed%row_p
            WRITE (*, *) 'new . col_i', transposed%col_i
            WRITE (*, *) 'new . blk_p', blk_p !transposed%blk_p
         END IF
      ELSE
         ! Keep the index untransposed (block-compressed-sparse-column view).
         transposed%row_p(:) = normal%row_p(:)
         transposed%col_i(:) = normal%col_i(:)
         blk_p(:) = normal%blk_p(:)
         !transposed%transpose = .TRUE.
      END IF
      ! Copy the data
      IF (shallow) THEN
         ! Share the source data area; negative blk_p entries mark the blocks
         ! as (logically) transposed in place.
         CALL dbcsr_switch_data_area(transposed, normal%data_area)
         transposed%blk_p(1:transposed%nblks) = &
            -blk_p(1:transposed%nblks)
      ELSE
         ! Deep copy: sort the data into the new index order, transposing the
         ! individual blocks (or only marking them) as requested.
         CALL dbcsr_copy_sort_data(transposed%blk_p, blk_p, transposed%row_p, &
                                   transposed%col_i, array_data(transposed%row_blk_size), &
                                   array_data(transposed%col_blk_size), &
                                   transposed%data_area, normal%data_area, &
                                   mark_transposed=.not. tr_blocks, &
                                   transpose_blocks=tr_blocks)
      END IF
      transposed%valid = .TRUE.
      !CALL dbcsr_copy_sort_data (transposed%blk_p, blk_p, transposed%row_p,&
      !     transposed%col_i, array_data (transposed%row_blk_size),&
      !     array_data (transposed%col_blk_size),&
      !     transposed%data_area, normal%data_area,&
      !     transpose_blocks=.TRUE.)
      !
1315  FORMAT(I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5, 1X, I5)
      IF (dbg) THEN
         WRITE (*, *) 'new FINAL index'
         WRITE (*, 1315) transposed%row_p
         WRITE (*, 1315) transposed%col_i
         WRITE (*, 1315) transposed%blk_p
      END IF
      !
      IF (tr_index) DEALLOCATE (blk_p)
      !
      ! Default is to redistribute only when neither a transposed distribution
      ! nor a shallow copy was requested (the undetermined distribution from
      ! dbcsr_transpose_dims generally needs data movement to be usable).
      IF (PRESENT(redistribute)) THEN
         redist = redistribute
      ELSE
         redist = .NOT. tr_dist .AND. .NOT. shallow
      END IF
      IF (redist) THEN
         !write (*,*)routineN//" redistributing"
         CALL dbcsr_create(t2, template=transposed)
         CALL dbcsr_redistribute(transposed, t2)
         CALL dbcsr_release(transposed)
         transposed = t2
      END IF
      CALL timestop(handle)
   END SUBROUTINE dbcsr_new_transposed

   SUBROUTINE dbcsr_transposed(transposed, normal, shallow_data_copy, &
                               transpose_data, transpose_distribution, use_distribution)
      !! Transposes a DBCSR matrix, keeping the same distribution
      !!
      !! Convenience wrapper around dbcsr_new_transposed that supplies
      !! defaults: no shallow copy, reuse the input matrix's distribution,
      !! and do not transpose the distribution.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: transposed
      TYPE(dbcsr_type), INTENT(IN)                       :: normal
      LOGICAL, INTENT(IN), OPTIONAL                      :: shallow_data_copy, transpose_data, &
                                                            transpose_distribution
      TYPE(dbcsr_distribution_obj), INTENT(IN), &
         OPTIONAL                                        :: use_distribution

      LOGICAL                                            :: shallow, tr_dist
      TYPE(dbcsr_distribution_obj)                       :: dist

      ! Resolve the optional arguments into locals with friendly defaults
      ! (makes usage a bit less painful).
      shallow = .FALSE.
      IF (PRESENT(shallow_data_copy)) shallow = shallow_data_copy
      tr_dist = .FALSE.
      IF (PRESENT(transpose_distribution)) tr_dist = transpose_distribution
      dist = normal%dist
      IF (PRESENT(use_distribution)) dist = use_distribution

      ! transpose_data is forwarded as-is; passing a possibly-absent OPTIONAL
      ! on to another OPTIONAL dummy is standard-conforming.
      CALL dbcsr_new_transposed(transposed, normal, shallow, &
                                transpose_data, tr_dist, &
                                use_distribution=dist)
   END SUBROUTINE dbcsr_transposed

   SUBROUTINE dbcsr_desymmetrize_deep(sm, desm, untransposed_data)
      !! Duplicates data in symmetric matrix to make it normal (w.r.t. data
      !! structure. Can handle symmetric/antisymmetric/hermitian/skewhermitian
      !! matrices

      TYPE(dbcsr_type), INTENT(IN)                       :: sm
         !! input symmetric matrix
      TYPE(dbcsr_type), INTENT(INOUT)                    :: desm
         !! desymmetrized matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: untransposed_data
         !! physically transpose mirrored blocks instead of only marking
         !! them transposed via a negative blk_p; default is .FALSE.

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_desymmetrize_deep'
      ! metalen: integers of metadata per block (stored row, stored column,
      ! signed data offset); imeta/idata index the two counters per process.
      INTEGER, PARAMETER                                 :: idata = 2, imeta = 1, metalen = 3

      INTEGER :: blk, blk_l, blk_p, blk_ps, blks, col, col_size, dst_p, handle, &
                 nsymmetries, numproc, nze, pcol, prow, row, row_size, src_p, stored_col, stored_row, &
                 symmetry_i
      ! s*/r* arrays: per-process send/receive displacements and running
      ! fill positions (sdp/smp) for data and metadata buffers.
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rd_disp, recv_meta, rm_disp, sd_disp, &
                                                            sdp, send_meta, sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: recv_count, send_count, &
                                                            total_recv_count, total_send_count
      INTEGER, DIMENSION(:), POINTER                     :: col_blk_size, col_dist, row_blk_size, &
                                                            row_dist
      INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
      LOGICAL                                            :: make_untr, tr
      TYPE(dbcsr_data_obj)                               :: recv_data, send_data
      TYPE(dbcsr_distribution_obj)                       :: target_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      IF (.NOT. dbcsr_valid_index(sm)) &
         DBCSR_ABORT("Input matrix is invalid.")
      IF (dbcsr_valid_index(desm)) THEN
         CALL dbcsr_release(desm)
      END IF
      CALL dbcsr_create(desm, sm, matrix_type="N")

      IF (PRESENT(untransposed_data)) THEN
         make_untr = untransposed_data
      ELSE
         make_untr = .FALSE.
      END IF
      ! Symmetric matrices send every off-diagonal block twice: once at its
      ! stored position and once at the mirrored (transposed) position.
      nsymmetries = 1
      IF (sm%symmetry) THEN
         nsymmetries = 2
      END IF
      row_blk_size => array_data(sm%row_blk_size)
      col_blk_size => array_data(sm%col_blk_size)
      target_dist = sm%dist
      row_dist => dbcsr_distribution_row_dist(target_dist)
      col_dist => dbcsr_distribution_col_dist(target_dist)
      mp_obj = dbcsr_distribution_mp(target_dist)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      numproc = dbcsr_mp_numnodes(mp_obj)
      mp_group = dbcsr_mp_group(mp_obj)
      !
      ALLOCATE (send_count(2, 0:numproc - 1))
      ALLOCATE (recv_count(2, 0:numproc - 1))
      ALLOCATE (total_send_count(2, 0:numproc - 1))
      ALLOCATE (total_recv_count(2, 0:numproc - 1))
      ALLOCATE (sdp(0:numproc - 1))
      ALLOCATE (sd_disp(0:numproc - 1))
      ALLOCATE (smp(0:numproc - 1))
      ALLOCATE (sm_disp(0:numproc - 1))
      ALLOCATE (rd_disp(0:numproc - 1))
      ALLOCATE (rm_disp(0:numproc - 1))
      !
      desm%negate_real = sm%negate_real
      desm%negate_imaginary = sm%negate_imaginary
      ! Count initial sizes for sending.  First pass over the matrix only
      ! tallies how many blocks/elements go to each destination process.
      send_count(:, :) = 0
      CALL dbcsr_iterator_start(iter, sm)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                        row_size=row_size, col_size=col_size)
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col
            ELSE
               ! Diagonal blocks have no mirrored counterpart.
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row
            END IF
            ! Where do we send this block?
            prow = row_dist(stored_row)
            pcol = col_dist(stored_col)
            dst_p = blacs2mpi(prow, pcol)
            nze = row_size*col_size
            send_count(imeta, dst_p) = send_count(imeta, dst_p) + 1
            send_count(idata, dst_p) = send_count(idata, dst_p) + nze
         END DO ! symmetry_i
      END DO ! iter
      CALL dbcsr_iterator_stop(iter)
      !
      ! Exchange the counts so every process knows what it will receive.
      CALL mp_alltoall(send_count, recv_count, 2, mp_group)
      !
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data)
      CALL dbcsr_data_new(recv_data, dbcsr_get_data_type(sm), &
                          SUM(recv_count(idata, :)))
      ALLOCATE (recv_meta(metalen*SUM(recv_count(imeta, :))))
      CALL dbcsr_data_init(send_data)
      CALL dbcsr_data_new(send_data, dbcsr_get_data_type(sm), &
                          SUM(send_count(idata, :)))
      ALLOCATE (send_meta(metalen*SUM(send_count(imeta, :))))
      !
      ! Fill in the meta data structures and copy the data.
      DO dst_p = 0, numproc - 1
         total_send_count(imeta, dst_p) = send_count(imeta, dst_p)
         total_send_count(idata, dst_p) = send_count(idata, dst_p)
         total_recv_count(imeta, dst_p) = recv_count(imeta, dst_p)
         total_recv_count(idata, dst_p) = recv_count(idata, dst_p)
      END DO
      ! Build 1-based prefix-sum displacements into the flat send/receive
      ! buffers, one slot per destination/source process.
      sd_disp = -1; sm_disp = -1
      rd_disp = -1; rm_disp = -1
      sd_disp(0) = 1; sm_disp(0) = 1
      rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numproc - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(imeta, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(idata, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(imeta, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(idata, dst_p - 1)
      END DO
      ! Running fill positions start at the displacement of each process.
      sdp(:) = sd_disp
      smp(:) = sm_disp
      !
      ! Second pass: pack block metadata and data into the send buffers.
      CALL dbcsr_iterator_start(iter, sm)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                        row_size=row_size, col_size=col_size)
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col; tr = .FALSE.
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row; tr = .TRUE.
            END IF
            ! Where do we send this block?
            prow = row_dist(stored_row)
            pcol = col_dist(stored_col)
            dst_p = blacs2mpi(prow, pcol)
            nze = row_size*col_size
            ! Metadata triple: row, column, offset relative to this
            ! destination's data segment (1-based).
            send_meta(smp(dst_p)) = stored_row
            send_meta(smp(dst_p) + 1) = stored_col
            send_meta(smp(dst_p) + 2) = sdp(dst_p) - sd_disp(dst_p) + 1
            IF (make_untr) THEN
               ! Physically transpose the mirrored copy while packing; a
               ! negative source blk_p means the stored block is transposed.
               CALL dbcsr_block_partial_copy(dst=send_data, &
                                             dst_rs=row_size, dst_cs=col_size, &
                                             dst_tr=symmetry_i .GT. 1, &
                                             dst_r_lb=1, dst_c_lb=1, dst_offset=sdp(dst_p) - 1, &
                                             nrow=row_size, ncol=col_size, &
                                             src=sm%data_area, &
                                             src_rs=row_size, src_cs=col_size, &
                                             src_tr=blk_p .LT. 0, &
                                             src_r_lb=1, src_c_lb=1, &
                                             src_offset=ABS(blk_p) - 1)
               IF (tr) THEN
                  ! Apply the symmetry operation to the mirrored copy
                  ! (antisymmetric: negate; hermitian: conjugate).
                  IF (sm%negate_real) THEN
                     CALL dbcsr_block_real_neg(dst=send_data, row_size=row_size, &
                                               col_size=col_size, lb=sdp(dst_p))
                  END IF
                  IF (sm%negate_imaginary) &
                     CALL dbcsr_block_conjg(dst=send_data, row_size=row_size, &
                                            col_size=col_size, lb=sdp(dst_p))
               END IF
            ELSE
               ! Copy the data untouched and mark the mirrored copy as
               ! transposed by negating its offset in the metadata.
               CALL dbcsr_data_copy(send_data, (/sdp(dst_p)/), (/nze/), &
                                    sm%data_area, (/ABS(blk_p)/), (/nze/))
               IF (tr) &
                  send_meta(smp(dst_p) + 2) = -send_meta(smp(dst_p) + 2)
            END IF
            smp(dst_p) = smp(dst_p) + metalen
            sdp(dst_p) = sdp(dst_p) + nze
         END DO ! symmetry_i
      END DO ! iter
      CALL dbcsr_iterator_stop(iter)
      ! Exchange the data and metadata structures.
      CALL hybrid_alltoall_any(send_data, &
                               total_send_count(idata, :), sd_disp(:) - 1, &
                               recv_data, &
                               total_recv_count(idata, :), rd_disp(:) - 1, mp_obj)
      CALL mp_alltoall(send_meta(:), metalen*total_send_count(imeta, :), sm_disp(:) - 1, &
                       recv_meta(:), metalen*total_recv_count(imeta, :), rm_disp(:) - 1, &
                       mp_group)
      ! Now fill in the data.
      CALL dbcsr_work_create(desm, &
                             SUM(recv_count(imeta, :)), &
                             SUM(recv_count(idata, :)), n=1, &
                             work_mutable=.FALSE.)
      ! Switch data data area of the work matrix with the received data
      ! (avoids copying).
      CALL dbcsr_data_hold(recv_data)
      CALL dbcsr_data_release(desm%wms(1)%data_area)
      desm%wms(1)%data_area = recv_data
      !
      ! Translate the received metadata into the work matrix's index arrays;
      ! received offsets are segment-relative and must be rebased by each
      ! source's data displacement (sign of blk_p carries the transpose flag).
      blk_ps = 1
      blks = 1
      ! WRITE(*,*)rm_disp
      ! WRITE(*,*)recv_count
      DO src_p = 0, numproc - 1
         IF (careful_mod) THEN
            IF ((blks - 1)*3 /= rm_disp(src_p) - 1) &
               DBCSR_ABORT("Count mismatch")
         END IF
         blks = (rm_disp(src_p) - 1)/metalen + 1
         DO blk_l = 1, recv_count(imeta, src_p)
            stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1))
            stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1)
            blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2)
            desm%wms(1)%row_i(blks) = stored_row
            desm%wms(1)%col_i(blks) = stored_col
            desm%wms(1)%blk_p(blks) = SIGN(ABS(blk_p) + (rd_disp(src_p) - 1), blk_p)
            nze = row_blk_size(ABS(stored_row)) &
                  *col_blk_size(stored_col)
            blk_ps = blk_ps + nze
            blks = blks + 1
         END DO
      END DO
      !
      desm%wms(1)%lastblk = blks - 1
      desm%wms(1)%datasize = blk_ps - 1
      CALL dbcsr_finalize(desm)
      DEALLOCATE (send_count)
      DEALLOCATE (recv_count)
      DEALLOCATE (sdp); DEALLOCATE (sd_disp)
      DEALLOCATE (smp); DEALLOCATE (sm_disp)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)
      CALL dbcsr_data_release(recv_data)
      DEALLOCATE (recv_meta)
      CALL dbcsr_data_release(send_data)
      DEALLOCATE (send_meta)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_desymmetrize_deep

   SUBROUTINE dbcsr_distribute(matrix, fast)
      !! Distributes a matrix that is currently replicated.
      !!
      !! Keeps only those blocks that are locally stored according to the
      !! matrix's distribution; non-local blocks are dropped.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to replicate
      LOGICAL, INTENT(in), OPTIONAL                      :: fast
         !! change just the index, don't touch the data

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_distribute'

      COMPLEX(KIND=dp), DIMENSION(:), POINTER, CONTIGUOUS :: c_dp
      COMPLEX(KIND=sp), DIMENSION(:), POINTER, CONTIGUOUS :: c_sp
      INTEGER                                            :: blk, col, handle, mynode, nblks, nze, p, &
                                                            row
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: col_blk_size, row_blk_size, tmp_index
      LOGICAL                                            :: mini, tr
      REAL(KIND=dp), DIMENSION(:), POINTER, CONTIGUOUS   :: r_dp
      REAL(KIND=sp), DIMENSION(:), POINTER, CONTIGUOUS   :: r_sp
      TYPE(dbcsr_data_obj)                               :: tmp_data
      TYPE(dbcsr_distribution_obj)                       :: dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_type)                                   :: distributed

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Matrix not initialized.")
      IF (matrix%replication_type .EQ. dbcsr_repl_none) &
         DBCSR_WARN("Distributing a non-replicated matrix makes no sense.")
      IF (PRESENT(fast)) THEN
         mini = fast
      ELSE
         mini = .FALSE.
      END IF
      ! Only real double precision is implemented; the other data types are
      ! rejected after retrieving their pointer.
      SELECT CASE (matrix%data_type)
      CASE (dbcsr_type_real_8)
         CALL dbcsr_get_data(matrix%data_area, r_dp)
      CASE (dbcsr_type_real_4)
         CALL dbcsr_get_data(matrix%data_area, r_sp)
         DBCSR_ABORT("Only real double precision")
      CASE (dbcsr_type_complex_8)
         CALL dbcsr_get_data(matrix%data_area, c_dp)
         DBCSR_ABORT("Only real double precision")
      CASE (dbcsr_type_complex_4)
         CALL dbcsr_get_data(matrix%data_area, c_sp)
         DBCSR_ABORT("Only real double precision")
      END SELECT
      row_blk_size => array_data(matrix%row_blk_size)
      col_blk_size => array_data(matrix%col_blk_size)
      dist = dbcsr_distribution(matrix)
      mp_obj = dbcsr_distribution_mp(dist)
      mynode = dbcsr_mp_mynode(dbcsr_distribution_mp(dist))
      !
      IF (mini) THEN
         ! We just mark the blocks as deleted.
         CALL dbcsr_iterator_start(iter, matrix)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, r_dp, tr, blk)
            tr = .FALSE.
            CALL dbcsr_get_stored_coordinates(matrix, row, col, p)
            ! Zeroed blk_p marks the (local) block as deleted.
            IF (mynode .EQ. p) THEN
               matrix%blk_p(blk) = 0
            END IF
         END DO
         CALL dbcsr_iterator_stop(iter)
         matrix%replication_type = dbcsr_repl_none
      ELSE
         ! Build a fresh matrix containing only the locally-owned blocks,
         ! then swap its data and index into the input matrix.
         CALL dbcsr_create(distributed, name='Distributed '//TRIM(matrix%name), &
                           template=matrix, &
                           matrix_type=dbcsr_type_no_symmetry, &
                           replication_type=dbcsr_repl_none)
         distributed%replication_type = dbcsr_repl_none
         ! First count how many blocks are local.
         nze = 0
         nblks = 0
         CALL dbcsr_iterator_start(iter, matrix)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, r_dp, tr, blk)
            tr = .FALSE.
            CALL dbcsr_get_stored_coordinates(matrix, row, col, p)
            IF (mynode .EQ. p) THEN
               nze = nze + row_blk_size(row)*col_blk_size(col)
               nblks = nblks + 1
            END IF
         END DO
         CALL dbcsr_iterator_stop(iter)
         ! Preallocate the array
         CALL dbcsr_work_create(distributed, nblks_guess=nblks, &
                                sizedata_guess=nze, work_mutable=.FALSE.)
         ! Now actually do the work
         CALL dbcsr_iterator_start(iter, matrix)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, r_dp, tr, blk)
            ! Blocks are stored untransposed in the distributed copy.
            tr = .FALSE.
            CALL dbcsr_get_stored_coordinates(matrix, row, col, p)
            IF (mynode .EQ. p) THEN
               CALL dbcsr_put_block(distributed, row, col, r_dp, transposed=tr)
            END IF
         END DO
         CALL dbcsr_iterator_stop(iter)
         CALL dbcsr_finalize(distributed)
         ! Now replace the data and index: swap the data areas and index
         ! arrays of the two matrices so no copy is needed, then release
         ! the temporary (which now owns the old replicated data).
         CALL dbcsr_switch_data_area(matrix, distributed%data_area, &
                                     previous_data_area=tmp_data)
         CALL dbcsr_switch_data_area(distributed, tmp_data)
         CALL dbcsr_data_release(tmp_data)
         tmp_index => matrix%index
         matrix%index => distributed%index
         distributed%index => tmp_index
         CALL dbcsr_repoint_index(matrix)
         matrix%nze = distributed%nze
         matrix%nblks = distributed%nblks
         CALL dbcsr_release(distributed)
      END IF
      CALL timestop(handle)
   END SUBROUTINE dbcsr_distribute

   SUBROUTINE dbcsr_make_untransposed_blocks(matrix)
      !! Detransposes all blocks in a matrix
      !!
      !! Every block stored in transposed form is transposed in place and
      !! its index entry is updated so that afterwards no block of the
      !! matrix is marked as transposed.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! DBCSR matrix

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_untransposed_blocks'

      INTEGER                                            :: blk, col, col_size, handle, row, row_size
      INTEGER, DIMENSION(:), POINTER                     :: cbs, rbs
      LOGICAL                                            :: sym_negation, tr
      TYPE(dbcsr_data_obj)                               :: block_data
      TYPE(dbcsr_iterator)                               :: iter

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      rbs => dbcsr_row_block_sizes(matrix)
      cbs => dbcsr_col_block_sizes(matrix)
      ! If set, the real part of each detransposed block is negated
      ! (mirrors matrix%negate_real).
      sym_negation = matrix%negate_real

      ! Threads share the iteration over blocks; each block is visited by
      ! exactly one thread, so the per-block update of matrix%blk_p below
      ! is race-free.
!$OMP PARALLEL DEFAULT(NONE) PRIVATE(block_data,iter,row,col,tr,blk,row_size,col_size) &
!$OMP          SHARED(matrix,rbs,cbs,sym_negation)
      CALL dbcsr_data_init(block_data)
      CALL dbcsr_data_new(block_data, dbcsr_get_data_type(matrix))
      CALL dbcsr_iterator_start(iter, matrix)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         ! block_data is set to point into the matrix data area, so the
         ! transposition below acts in place on the stored block.
         CALL dbcsr_iterator_next_block(iter, row, col, block_data, &
                                        transposed=tr, &
                                        block_number=blk)
         IF (tr) THEN
            row_size = rbs(row)
            col_size = cbs(col)
            ! The stored (transposed) block has dimensions
            ! col_size x row_size; transpose it in place.
            CALL dbcsr_block_transpose(block_data, col_size, row_size)
            IF (sym_negation) THEN
               ! Negate all elements (for complex types this negates both
               ! the real and the imaginary part).
               SELECT CASE (block_data%d%data_type)
               CASE (dbcsr_type_real_4)
                  block_data%d%r_sp(:) = -block_data%d%r_sp(:)
               CASE (dbcsr_type_real_8)
                  block_data%d%r_dp(:) = -block_data%d%r_dp(:)
               CASE (dbcsr_type_complex_4)
                  block_data%d%c_sp(:) = -block_data%d%c_sp(:)
               CASE (dbcsr_type_complex_8)
                  block_data%d%c_dp(:) = -block_data%d%c_dp(:)
               END SELECT
            END IF
            ! Flip the sign of the index pointer to clear the transposed
            ! flag (presumably a negative blk_p marks a transposed block;
            ! consistent with the iterator having reported tr=.TRUE.).
            matrix%blk_p(blk) = -matrix%blk_p(blk)
         END IF
      END DO
      CALL dbcsr_iterator_stop(iter)
      ! block_data only aliased matrix data; clear the pointer before the
      ! release so the matrix data itself is not deallocated.
      CALL dbcsr_data_clear_pointer(block_data)
      CALL dbcsr_data_release(block_data)
!$OMP END PARALLEL

      CALL timestop(handle)
   END SUBROUTINE dbcsr_make_untransposed_blocks

   SUBROUTINE dbcsr_make_dense(matrix, dense_matrix, dense_dist, &
                               dense_row_sizes, dense_col_sizes, row_map, col_map)
      !! Makes a dense matrix, inplace.
      !!
      !! Creates dense_matrix from the blocked matrix using the given dense
      !! distribution and block sizes, then delegates the data/index
      !! conversion to dbcsr_make_dense_low.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! matrix to make dense
      TYPE(dbcsr_type), INTENT(OUT)                      :: dense_matrix
         !! new dense matrix (created here from matrix as template)
      TYPE(dbcsr_distribution_obj), INTENT(INOUT)        :: dense_dist
         !! distribution of the dense matrix
      TYPE(array_i1d_obj), INTENT(IN)                    :: dense_row_sizes, dense_col_sizes, &
                                                            row_map, col_map
         !! dense row block sizes
         !! dense column block sizes
         !! mapping of blocked rows to dense rows
         !! mapping of blocked columns to dense columns

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_dense'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle
      REAL(kind=dp)                                      :: cs
      TYPE(array_i1d_obj)                                :: dense_local_cols, dense_local_rows
      TYPE(dbcsr_distribution_obj)                       :: old_distribution

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      CALL dbcsr_create(dense_matrix, template=matrix, &
                        dist=dense_dist, &
                        row_blk_size_obj=dense_row_sizes, &
                        col_blk_size_obj=dense_col_sizes)
      !
      IF (dbg) THEN
         ! Checksum of the input for before/after comparison.
         cs = dbcsr_checksum(matrix)
         WRITE (*, *) routineN//" prod cs pre", cs
      END IF
      old_distribution = dbcsr_distribution(matrix)
      ! Conversion of global to local offsets for the dense blocks
      ! (uses the home process row/column recorded in the freshly created
      ! dense_matrix index).
      ! NOTE(review): dense_local_rows/dense_local_cols are obtained here
      ! but no array_release is visible in this routine — confirm the
      ! references are owned/released by the distribution or callee.
      CALL dbcsr_get_local_rows(dense_dist, dense_local_rows, &
                                dense_matrix%index(dbcsr_slot_home_prow))
      CALL dbcsr_get_local_cols(dense_dist, dense_local_cols, &
                                dense_matrix%index(dbcsr_slot_home_pcol))
      !
      ! Perform the actual data reshuffle and index construction; the last
      ! two arguments request joining both rows and columns.
      CALL dbcsr_make_dense_low(matrix, dense_matrix, &
                                dbcsr_distribution_local_rows(old_distribution), &
                                dbcsr_distribution_local_cols(old_distribution), &
                                array_data(matrix%row_blk_offset), &
                                array_data(matrix%col_blk_offset), &
                                array_data(dense_local_rows), array_data(dense_local_cols), &
                                array_data(dense_matrix%row_blk_offset), &
                                array_data(dense_matrix%col_blk_offset), &
                                array_data(row_map), array_data(col_map), .TRUE., .TRUE.)
      IF (dbg) THEN
         cs = dbcsr_checksum(dense_matrix)
         WRITE (*, *) routineN//" prod cs pst", cs
      END IF
      CALL timestop(handle)
   END SUBROUTINE dbcsr_make_dense

   SUBROUTINE dbcsr_make_dense_low(und_matrix, dense_matrix, &
                                   und_local_rows, und_local_cols, &
                                   und_row_blk_offsets, und_col_blk_offsets, &
                                   dense_local_rows, dense_local_cols, &
                                   dense_row_blk_offsets, dense_col_blk_offsets, &
                                   row_map, col_map, join_rows, join_cols)
      !! Copies a matrix and makes its data dense.
      !!
      !! Converts the global offset arrays to process-local offsets,
      !! reshuffles the block data into a dense layout (make_dense_data)
      !! and builds a matching dense index (make_dense_index).
      !! @note the dense_matrix must have been created

      TYPE(dbcsr_type), INTENT(IN)                       :: und_matrix
         !! Original non-dense matrix
      TYPE(dbcsr_type), INTENT(INOUT)                    :: dense_matrix
         !! Dense copy of und_matrix
      INTEGER, DIMENSION(:), INTENT(IN) :: und_local_rows, und_local_cols, und_row_blk_offsets, &
                                           und_col_blk_offsets, dense_local_rows, dense_local_cols, dense_row_blk_offsets, &
                                           dense_col_blk_offsets, row_map, col_map
         !! The process-grid local rows of the non-dense und_matrix
         !! The process-grid local columns of the non-dense und_matrix
         !! The block offsets of the rows of the non-dense matrix
         !! The block offsets of the columns of the non-dense matrix
         !! The process-grid local rows of the dense matrix
         !! The process-grid local columns of the dense matrix
         !! The block offsets of the rows of the dense matrix
         !! The block offsets of the columns of the dense matrix
         !! Mapping of non-dense rows to dense rows
         !! Mapping of non-dense columns to dense columns
      LOGICAL, INTENT(IN)                                :: join_rows, join_cols
         !! Make rows dense
         !! Make columns dense

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_dense_low'

      INTEGER :: dense_nblkcols_local, dense_nblkcols_total, dense_nblkrows_local, &
                 dense_nblkrows_total, dense_nlocal_blocks, error_handle, nfullcols, nfullrows, &
                 und_nblkcols_total, und_nblkrows_total
      INTEGER, ALLOCATABLE, DIMENSION(:) :: col_internal_offsets, dense_local_col_blk_offsets, &
                                            dense_local_row_blk_offsets, row_internal_offsets, und_local_col_blk_offsets, &
                                            und_local_row_blk_offsets
      INTEGER, DIMENSION(dbcsr_meta_size)                :: meta
      TYPE(dbcsr_data_obj)                               :: dense_data, und_data

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, error_handle)
      !
      ! Number of full (element) rows/columns held locally; determines the
      ! size of the dense data area.
      nfullrows = nfull_elements(und_row_blk_offsets, und_local_rows)
      nfullcols = nfull_elements(und_col_blk_offsets, und_local_cols)
      !
      ! Offset arrays have one extra entry, hence the -1 for block counts.
      und_nblkrows_total = SIZE(und_row_blk_offsets) - 1
      und_nblkcols_total = SIZE(und_col_blk_offsets) - 1
      !
      ! Find the local data offsets (but indexed by the global
      ! rows/columns) for the undense data.
      ALLOCATE (und_local_row_blk_offsets(und_nblkrows_total + 1))
      ALLOCATE (und_local_col_blk_offsets(und_nblkcols_total + 1))
      CALL global_offsets_to_local( &
         und_row_blk_offsets, &
         und_local_rows, &
         und_local_row_blk_offsets)
      CALL global_offsets_to_local( &
         und_col_blk_offsets, &
         und_local_cols, &
         und_local_col_blk_offsets)
      !
      dense_nblkrows_total = SIZE(dense_row_blk_offsets) - 1
      dense_nblkcols_total = SIZE(dense_col_blk_offsets) - 1
      dense_nblkrows_local = SIZE(dense_local_rows)
      dense_nblkcols_local = SIZE(dense_local_cols)
      !
      ! Find the local data offsets (but indexed by the (dense) global
      ! rows/columns) for the dense data.
      ALLOCATE (dense_local_row_blk_offsets(dense_nblkrows_total + 1))
      ALLOCATE (dense_local_col_blk_offsets(dense_nblkcols_total + 1))
      CALL global_offsets_to_local( &
         dense_row_blk_offsets, &
         dense_local_rows, &
         dense_local_row_blk_offsets)
      CALL global_offsets_to_local( &
         dense_col_blk_offsets, &
         dense_local_cols, &
         dense_local_col_blk_offsets)
      ! Find the offset of blocks within dense rows/columns.  This is needed
      ! since the blocked rows/columns are not necessarily in the same order.
      ALLOCATE (row_internal_offsets(und_nblkrows_total))
      ALLOCATE (col_internal_offsets(und_nblkcols_total))
      CALL get_internal_offsets( &
         und_local_rows, row_map, &
         und_local_row_blk_offsets, &
         dense_local_row_blk_offsets, &
         row_internal_offsets)
      CALL get_internal_offsets( &
         und_local_cols, col_map, &
         und_local_col_blk_offsets, &
         dense_local_col_blk_offsets, &
         col_internal_offsets)
      !
      ! Hold a reference to the source data and allocate a fresh dense
      ! data area (same type and memory type as the source).
      und_data = und_matrix%data_area
      CALL dbcsr_data_hold(und_data)
      CALL dbcsr_data_init(dense_data)
      CALL dbcsr_data_new(dense_data, dbcsr_data_get_type(und_data), &
                          data_size=nfullrows*nfullcols, &
                          memory_type=dbcsr_data_get_memory_type(und_data))
      !
      ! Reshuffle the data
      CALL make_dense_data(und_matrix, &
                           dense_data, nfullrows, nfullcols, &
                           und_local_row_blk_offsets, und_local_col_blk_offsets, &
                           dense_local_row_blk_offsets, dense_local_col_blk_offsets, &
                           row_map=row_map, col_map=col_map, &
                           row_internal_offsets=row_internal_offsets, &
                           col_internal_offsets=col_internal_offsets, &
                           join_rows=join_rows, join_cols=join_cols, make_tr=.FALSE.)
      ! dense_matrix takes over the dense data area; release the local
      ! references held by this routine.
      CALL dbcsr_switch_data_area(dense_matrix, dense_data)
      CALL dbcsr_data_release(dense_data)
      CALL dbcsr_data_release(und_data)
      !
      ! Create the new dense index.
      dense_nlocal_blocks = dense_nblkrows_local*dense_nblkcols_local
      CALL dbcsr_addto_index_array(dense_matrix, &
                                   dbcsr_slot_row_p, &
                                   reservation=dense_nblkrows_total + 1, extra=2*dense_nlocal_blocks)
      CALL dbcsr_addto_index_array(dense_matrix, &
                                   dbcsr_slot_col_i, &
                                   reservation=dense_nlocal_blocks)
      CALL dbcsr_addto_index_array(dense_matrix, &
                                   dbcsr_slot_blk_p, &
                                   reservation=dense_nlocal_blocks)
      !
      ! Work on a local copy of the index metadata, then write it back
      ! after the dense index has been built.
      meta = dense_matrix%index(1:dbcsr_meta_size)
      CALL dbcsr_pack_meta(dense_matrix, meta)
      meta(dbcsr_slot_nze) = nfullrows*nfullcols
      meta(dbcsr_slot_nblks) = dense_nlocal_blocks
      CALL make_dense_index(dense_matrix%row_p, &
                            dense_matrix%col_i, &
                            dense_matrix%blk_p, &
                            dense_nblkrows_total, dense_nblkcols_total, &
                            dense_local_rows, &
                            dense_local_cols, &
                            dense_local_row_blk_offsets, &
                            dense_local_col_blk_offsets, &
                            make_tr=.FALSE., &
                            meta=meta)
      CALL dbcsr_unpack_meta(dense_matrix, meta)
      !
      DEALLOCATE (und_local_row_blk_offsets)
      DEALLOCATE (und_local_col_blk_offsets)
      DEALLOCATE (dense_local_row_blk_offsets)
      DEALLOCATE (dense_local_col_blk_offsets)
      DEALLOCATE (row_internal_offsets)
      DEALLOCATE (col_internal_offsets)
      !
      CALL timestop(error_handle)
   END SUBROUTINE dbcsr_make_dense_low

   SUBROUTINE dbcsr_make_undense(matrix, undense_matrix, distribution, &
                                 row_blk_offsets, col_blk_offsets, row_blk_sizes, col_blk_sizes, &
                                 row_map, col_map)
      !! Makes a blocked matrix from a dense matrix, inplace
      !!
      !! Builds undense_matrix with the given blocked distribution and
      !! block sizes, constructs its full (local) index, and reshuffles
      !! the dense data of matrix into blocked form.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! dense matrix
      TYPE(dbcsr_type), INTENT(INOUT)                    :: undense_matrix
         !! matrix to make undense
      TYPE(dbcsr_distribution_obj), INTENT(IN)           :: distribution
         !! distribution of non-dense rows and columns
      TYPE(array_i1d_obj), INTENT(IN)                    :: row_blk_offsets, col_blk_offsets, &
                                                            row_blk_sizes, col_blk_sizes, row_map, &
                                                            col_map
         !! non-dense row block offsets
         !! non-dense column block offsets
         !! non-dense row block sizes
         !! non-dense column block sizes
         !! mapping from non-dense rows
         !! mapping from non-dense columns

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_undense'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, nblkcols_local, nblkcols_total, &
                                                            nblkrows_local, nblkrows_total, &
                                                            nfullcols_local, nfullrows_local
      INTEGER, ALLOCATABLE, DIMENSION(:) :: col_internal_offsets, dense_local_col_blk_offsets, &
                                            dense_local_row_blk_offsets, local_col_blk_offsets, local_row_blk_offsets, &
                                            row_internal_offsets
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: local_cols, local_rows, meta
      REAL(kind=dp)                                      :: cs
      TYPE(dbcsr_data_obj)                               :: blocked_data, dense_data
      TYPE(dbcsr_distribution_obj)                       :: dense_distribution

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      IF (dbg) THEN
         ! Checksum of the dense input for before/after comparison.
         cs = dbcsr_checksum(matrix)
         WRITE (*, *) routineN//" prod cs pre", cs
      END IF
      dense_distribution = dbcsr_distribution(matrix)
      nfullrows_local = matrix%nfullrows_local
      nfullcols_local = matrix%nfullcols_local
      nblkrows_local = dbcsr_distribution_nlocal_rows(distribution)
      nblkcols_local = dbcsr_distribution_nlocal_cols(distribution)
      nblkrows_total = dbcsr_distribution_nrows(distribution)
      nblkcols_total = dbcsr_distribution_ncols(distribution)
      local_rows => dbcsr_distribution_local_rows(distribution)
      local_cols => dbcsr_distribution_local_cols(distribution)
      CALL dbcsr_create(undense_matrix, template=matrix, &
                        dist=distribution, &
                        row_blk_size_obj=row_blk_sizes, &
                        col_blk_size_obj=col_blk_sizes)
      ! Restore previous offsets, just to try to keep the same memory.
      CALL array_release(undense_matrix%row_blk_offset)
      CALL array_release(undense_matrix%col_blk_offset)
      undense_matrix%row_blk_offset = row_blk_offsets
      undense_matrix%col_blk_offset = col_blk_offsets
      CALL array_hold(undense_matrix%row_blk_offset)
      CALL array_hold(undense_matrix%col_blk_offset)
      !
      ! Build a full local index: every local row/column pair gets a block.
      ALLOCATE (local_row_blk_offsets(nblkrows_total + 1))
      ALLOCATE (local_col_blk_offsets(nblkcols_total + 1))
      CALL dbcsr_clearfrom_index_array(undense_matrix, dbcsr_slot_row_p)
      CALL dbcsr_clearfrom_index_array(undense_matrix, dbcsr_slot_col_i)
      CALL dbcsr_clearfrom_index_array(undense_matrix, dbcsr_slot_blk_p)
      CALL dbcsr_addto_index_array(undense_matrix, dbcsr_slot_row_p, &
                                   reservation=nblkrows_total + 1, extra=nblkrows_local*nblkcols_local*2)
      CALL dbcsr_addto_index_array(undense_matrix, dbcsr_slot_col_i, &
                                   reservation=nblkrows_local*nblkcols_local)
      CALL dbcsr_addto_index_array(undense_matrix, dbcsr_slot_blk_p, &
                                   reservation=nblkrows_local*nblkcols_local)
      ! meta points directly into the matrix index; changes are written
      ! back via dbcsr_unpack_meta below.
      meta => undense_matrix%index(1:dbcsr_meta_size)
      CALL dbcsr_pack_meta(undense_matrix, meta)
      meta(dbcsr_slot_nblks) = nblkrows_local*nblkcols_local
      meta(dbcsr_slot_nze) = nfullrows_local*nfullcols_local
      CALL global_offsets_to_local(array_data(row_blk_offsets), &
                                   local_rows, local_row_blk_offsets(1:nblkrows_local + 1))
      CALL global_offsets_to_local(array_data(col_blk_offsets), &
                                   local_cols, local_col_blk_offsets(1:nblkcols_local + 1))
      CALL make_undense_index(undense_matrix%row_p, undense_matrix%col_i, undense_matrix%blk_p, &
                              distribution, &
                              local_row_blk_offsets(1:nblkrows_local + 1), &
                              local_col_blk_offsets(1:nblkcols_local + 1), &
                              meta)
      CALL dbcsr_unpack_meta(undense_matrix, meta)
      !
      ! Recompute the local offsets over the full (global-length) arrays;
      ! these are needed for the internal-offset calculation below.
      CALL global_offsets_to_local(array_data(row_blk_offsets), &
                                   local_rows, local_row_blk_offsets)
      CALL global_offsets_to_local(array_data(col_blk_offsets), &
                                   local_cols, local_col_blk_offsets)
      !
      ALLOCATE (dense_local_row_blk_offsets(1 + dbcsr_distribution_nrows(dense_distribution)))
      ALLOCATE (dense_local_col_blk_offsets(1 + dbcsr_distribution_ncols(dense_distribution)))
      CALL global_offsets_to_local(array_data(matrix%row_blk_offset), &
                                   dbcsr_distribution_local_rows(dense_distribution), &
                                   dense_local_row_blk_offsets)
      CALL global_offsets_to_local(array_data(matrix%col_blk_offset), &
                                   dbcsr_distribution_local_cols(dense_distribution), &
                                   dense_local_col_blk_offsets)
      ! Find the offset of blocks within dense rows/columns.  This is needed
      ! since the blocked rows/columns are not necessarily in the same order.
      ALLOCATE (row_internal_offsets(nblkrows_total))
      ALLOCATE (col_internal_offsets(nblkcols_total))
      CALL get_internal_offsets( &
         local_rows, array_data(row_map), &
         local_row_blk_offsets, &
         dense_local_row_blk_offsets, &
         row_internal_offsets)
      CALL get_internal_offsets( &
         local_cols, array_data(col_map), &
         local_col_blk_offsets, &
         dense_local_col_blk_offsets, &
         col_internal_offsets)
      !
      ! Hold a reference to the dense source data and give the undense
      ! matrix a fresh data area of the same type and memory type.
      dense_data = matrix%data_area
      CALL dbcsr_data_hold(dense_data)
      CALL dbcsr_data_init(blocked_data)
      CALL dbcsr_data_new(blocked_data, dbcsr_data_get_type(dense_data), &
                          data_size=nfullrows_local*nfullcols_local, &
                          memory_type=dbcsr_data_get_memory_type(dense_data))
      CALL dbcsr_switch_data_area(undense_matrix, blocked_data)
      CALL dbcsr_data_release(blocked_data)
      ! Reshuffle the data
      CALL make_undense_data(undense_matrix, dense_data, &
                             nfullrows_local, nfullcols_local, &
                             dense_local_row_blk_offsets, dense_local_col_blk_offsets, &
                             array_data(row_map), array_data(col_map), &
                             row_internal_offsets, col_internal_offsets)
      CALL dbcsr_data_release(dense_data)
      IF (dbg) THEN
         ! Checksum the newly blocked result (not the input), matching the
         ! pre/post pattern of dbcsr_make_dense.
         cs = dbcsr_checksum(undense_matrix)
         WRITE (*, *) routineN//" prod cs pst", cs
      END IF
      DEALLOCATE (local_row_blk_offsets)
      DEALLOCATE (local_col_blk_offsets)
      DEALLOCATE (dense_local_row_blk_offsets)
      DEALLOCATE (dense_local_col_blk_offsets)
      DEALLOCATE (row_internal_offsets)
      DEALLOCATE (col_internal_offsets)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_make_undense

   SUBROUTINE make_dense_data(matrix, dense_data, nfullrows, nfullcols, &
                              und_row_blk_offsets, und_col_blk_offsets, &
                              dense_row_blk_offsets, dense_col_blk_offsets, &
                              row_map, col_map, &
                              row_internal_offsets, col_internal_offsets, &
                              join_rows, join_cols, make_tr)
      !! Shuffles the data from blocked to standard dense form
      !!
      !! Iterates over all blocks of the blocked matrix and copies each
      !! into its position inside the (larger) dense target block that it
      !! maps to via row_map/col_map.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! Existing blocked matrix
      TYPE(dbcsr_data_obj), INTENT(INOUT)                :: dense_data
         !! Dense data
      INTEGER, INTENT(IN)                                :: nfullrows, nfullcols
         !! size of new data
         !! size of new data
      INTEGER, DIMENSION(:), INTENT(IN) :: und_row_blk_offsets, und_col_blk_offsets, &
                                           dense_row_blk_offsets, dense_col_blk_offsets, row_map, col_map, row_internal_offsets, &
                                           col_internal_offsets
      LOGICAL, INTENT(IN)                                :: join_rows, join_cols, make_tr
         !! make rows dense, default is yes
         !! make columns dense, default is yes
         !! make the dense blocks transposed

      CHARACTER(len=*), PARAMETER :: routineN = 'make_dense_data'

      INTEGER :: blk_col, blk_col_size, blk_row, blk_row_size, dense_col, dense_row, error_handle, &
                 target_col_offset, target_cs, target_offset, target_row_offset, target_rs, tco, tro
      LOGICAL                                            :: tr
      TYPE(dbcsr_data_obj)                               :: block
      TYPE(dbcsr_iterator)                               :: iter

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, error_handle)
      IF (dbcsr_data_get_size(dense_data) < nfullrows*nfullcols) &
         DBCSR_ABORT("Dense data too small")
      IF (.NOT. join_cols .AND. .NOT. join_rows) &
         DBCSR_WARN("Joining neither rows nor columns is untested")
      !
      ! Zero the target: positions not covered by existing blocks must be 0.
      CALL dbcsr_data_clear(dense_data)
      IF (dbcsr_data_get_size(matrix%data_area) .GT. 0 &
          .AND. nfullrows .GT. 0 .AND. nfullcols .GT. 0) THEN
         ! Threads share the block iteration (dynamic, shared iterator);
         ! each source block writes to a disjoint region of dense_data.
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP PRIVATE (block, iter, &
!$OMP         target_rs, target_cs, blk_row, blk_col, tr, blk_row_size, blk_col_size,&
!$OMP         tro, tco, target_offset,&
!$OMP         target_row_offset, target_col_offset,&
!$OMP         dense_row, dense_col) &
!$OMP SHARED (&
!$OMP         dense_data, matrix, &
!$OMP         make_tr, join_rows, join_cols, &
!$OMP         und_row_blk_offsets, und_col_blk_offsets,&
!$OMP         dense_row_blk_offsets, dense_col_blk_offsets,&
!$OMP         row_internal_offsets, col_internal_offsets,&
!$OMP         row_map, col_map,&
!$OMP         nfullrows, nfullcols)
         CALL dbcsr_data_init(block)
         CALL dbcsr_data_new(block, &
                             dbcsr_type_1d_to_2d(dbcsr_data_get_type(dense_data)))
         CALL dbcsr_iterator_start(iter, matrix, dynamic=.TRUE., shared=.TRUE., &
                                   contiguous_pointers=.FALSE., read_only=.TRUE.)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, block, tr, &
                                           row_size=blk_row_size, col_size=blk_col_size)
            ! Dense block this source block belongs to.
            dense_row = row_map(blk_row)
            dense_col = col_map(blk_col)
            !
            ! Calculate the target block row/column size and the offset
            ! within the target block where the undense block is placed.
            IF (join_rows) THEN
               target_row_offset = dense_row_blk_offsets(dense_row)
               target_rs = dense_row_blk_offsets(dense_row + 1) - &
                           dense_row_blk_offsets(dense_row)
               ! 1-based position of this source row-block inside the
               ! joined dense row-block.
               tro = 1 + row_internal_offsets(blk_row)
            ELSE
               target_row_offset = und_row_blk_offsets(blk_row)
               target_rs = blk_row_size
               tro = 1
            END IF
            IF (join_cols) THEN
               target_col_offset = dense_col_blk_offsets(dense_col)
               target_cs = dense_col_blk_offsets(dense_col + 1) - &
                           dense_col_blk_offsets(dense_col)
               tco = 1 + col_internal_offsets(blk_col)
            ELSE
               target_col_offset = und_col_blk_offsets(blk_col)
               target_cs = blk_col_size
               tco = 1
            END IF
            ! Linear offset of the target block within dense_data.
            ! NOTE(review): the column term strides by the dense row-block
            ! size even when join_rows is .FALSE. — confirm this is the
            ! intended layout for the non-joined case (warned as untested
            ! above only when neither is joined).
            target_offset = (target_row_offset - 1)*nfullcols &
                            + (target_col_offset - 1)*( &
                            dense_row_blk_offsets(dense_row + 1) - &
                            dense_row_blk_offsets(dense_row))
            CALL dbcsr_block_partial_copy(dst=dense_data, &
                                          dst_offset=target_offset, &
                                          dst_rs=target_rs, dst_cs=target_cs, dst_tr=make_tr, &
                                          dst_r_lb=tro, dst_c_lb=tco, &
                                          src=block, src_rs=blk_row_size, src_cs=blk_col_size, src_tr=tr, &
                                          src_r_lb=1, src_c_lb=1, nrow=blk_row_size, ncol=blk_col_size)
         END DO
         CALL dbcsr_iterator_stop(iter)
         ! block only aliased matrix data; detach before releasing.
         CALL dbcsr_data_clear_pointer(block)
         CALL dbcsr_data_release(block)
!$OMP END PARALLEL
      END IF
      CALL timestop(error_handle)
   END SUBROUTINE make_dense_data

   SUBROUTINE make_undense_data(matrix, dense_data, nfullrows, nfullcols, &
                                dense_row_blk_offsets, dense_col_blk_offsets, &
                                row_map, col_map, row_internal_offsets, col_internal_offsets)
      !! Shuffles the data from standard dense to blocked form
      !!
      !! Inverse of make_dense_data: iterates over the (pre-built) blocks
      !! of the blocked matrix and fills each from its position inside the
      !! corresponding dense block of dense_data.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! Matrix with data to fill
      TYPE(dbcsr_data_obj), INTENT(IN)                   :: dense_data
         !! Dense data
      INTEGER, INTENT(IN)                                :: nfullrows, nfullcols
         !! number of full rows in local submatrix
         !! number of full columns in local submatrix
      INTEGER, DIMENSION(:), INTENT(IN)                  :: dense_row_blk_offsets, &
                                                            dense_col_blk_offsets, row_map, &
                                                            col_map, row_internal_offsets, &
                                                            col_internal_offsets
         !! row block offsets for dense data
         !! column block offsets for dense data
         !! mapping from undense to dense rows
         !! mapping from undense to dense rows

      INTEGER :: blk_col, blk_col_size, blk_row, blk_row_size, dense_col, dense_col_offset, &
                 dense_cs, dense_offset, dense_row, dense_row_offset, dense_rs, sco, sro
      LOGICAL                                            :: tr
      TYPE(dbcsr_data_obj)                               :: block
      TYPE(dbcsr_iterator)                               :: iter

!   ---------------------------------------------------------------------------

      IF (dbcsr_data_get_size(dense_data) < nfullrows*nfullcols) &
         DBCSR_ABORT("Dense data too small")
      IF (dbcsr_data_get_size(matrix%data_area) .GT. 0) THEN
         CALL dbcsr_data_clear(matrix%data_area)
         ! Threads share the block iteration (dynamic, shared iterator);
         ! each thread writes only into the blocks it is handed.
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP PRIVATE (block, iter,&
!$OMP          blk_row, blk_col, tr,&
!$OMP          blk_row_size, blk_col_size, sro, sco,&
!$OMP          dense_row_offset, dense_col_offset, dense_row, dense_col,&
!$OMP          dense_cs, dense_rs,&
!$OMP          dense_offset) &
!$OMP SHARED (&
!$OMP         matrix, dense_data, &
!$OMP         nfullrows, nfullcols, &
!$OMP         dense_row_blk_offsets, dense_col_blk_offsets,&
!$OMP         row_map, col_map,&
!$OMP         row_internal_offsets, col_internal_offsets)
         CALL dbcsr_data_init(block)
         CALL dbcsr_data_new(block, &
                             dbcsr_type_1d_to_2d(dbcsr_data_get_type(dense_data)))
         CALL dbcsr_iterator_start(iter, matrix, dynamic=.TRUE., shared=.TRUE., &
                                   contiguous_pointers=.FALSE.)
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            ! block points into the matrix data area and is filled in place.
            CALL dbcsr_iterator_next_block(iter, blk_row, blk_col, block, tr, &
                                           row_size=blk_row_size, col_size=blk_col_size)
            ! Dense block this target block is taken from.
            dense_row = row_map(blk_row)
            dense_col = col_map(blk_col)
            dense_row_offset = dense_row_blk_offsets(dense_row)
            dense_col_offset = dense_col_blk_offsets(dense_col)
            ! Full size of the enclosing dense block.
            dense_rs = dense_row_blk_offsets(dense_row + 1) - &
                       dense_row_blk_offsets(dense_row)
            dense_cs = dense_col_blk_offsets(dense_col + 1) - &
                       dense_col_blk_offsets(dense_col)
            ! 1-based position of this block inside the dense block.
            sro = 1 + row_internal_offsets(blk_row)
            sco = 1 + col_internal_offsets(blk_col)
            ! Linear offset of the enclosing dense block within dense_data
            ! (mirrors the target_offset computation in make_dense_data).
            dense_offset = (dense_row_offset - 1)*nfullcols &
                           + (dense_col_offset - 1)*dense_rs
            CALL dbcsr_block_partial_copy( &
               dst=block, dst_rs=blk_row_size, dst_cs=blk_col_size, dst_tr=tr, &
               dst_r_lb=1, dst_c_lb=1, &
               src=dense_data, src_offset=dense_offset, &
               src_rs=dense_rs, src_cs=dense_cs, src_tr=.FALSE., &
               src_r_lb=sro, src_c_lb=sco, &
               nrow=blk_row_size, ncol=blk_col_size)
         END DO
         CALL dbcsr_iterator_stop(iter)
         ! block only aliased matrix data; detach before releasing.
         CALL dbcsr_data_clear_pointer(block)
         CALL dbcsr_data_release(block)
!$OMP END PARALLEL
      END IF
   END SUBROUTINE make_undense_data

   SUBROUTINE dbcsr_replicate_all(matrix)
      !! Replicates a DBCSR matrix on all processors.
      !!
      !! Convenience wrapper around dbcsr_replicate that requests
      !! replication along both process rows and process columns.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to replicate

      CALL dbcsr_replicate(matrix, &
                           replicate_rows=.TRUE., &
                           replicate_columns=.TRUE.)
   END SUBROUTINE dbcsr_replicate_all

   SUBROUTINE dbcsr_replicate(matrix, replicate_rows, replicate_columns, &
                              restrict_source)
      !! Replicates a DBCSR matrix among process rows and columns
      !!
      !! Direction definition
      !! Row replication means that all processors in a process grid sharing
      !! the same row get the data of the entire row. (In a 1-column grid the
      !! operation has no effect.) Similar logic applies to column replication.

      TYPE(dbcsr_type), INTENT(INOUT)                    :: matrix
         !! matrix to replicate
      LOGICAL, INTENT(IN)                                :: replicate_rows, replicate_columns
         !! Row should be replicated among all processors
         !! Column should be replicated among all processors
      INTEGER, INTENT(IN), OPTIONAL                      :: restrict_source
         !! Send only from this node (ignores blocks on other nodes)

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_replicate'
      ! Number of integers of metadata exchanged per block: row index,
      ! column index, and (signed) block data pointer (see the packing
      ! loop filling send_meta below).
      INTEGER, PARAMETER                                 :: metalen = 3
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      CHARACTER                                          :: rep_type
      INTEGER :: blk, blk_l, blk_p, blk_ps, blks, col, col_size, data_type, dst_p, handle, &
                 mynode, mypcol, myprow, nblks, numnodes, nze, offset, row, row_size, smp, &
                 src_p, stored_blk_p, stored_col, stored_row
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rd_disp, recv_meta, rm_disp, send_meta
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: recv_count
      INTEGER, DIMENSION(2)                              :: send_count
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: col_blk_size, col_dist, row_blk_size, &
                                                            row_dist, tmp_index
      INTEGER, DIMENSION(:, :), POINTER                  :: blacs2mpi
      LOGICAL                                            :: had_subcomms, i_am_restricted, rest_src, &
                                                            tr
      TYPE(dbcsr_data_obj)                               :: data_block, recv_data, send_data, &
                                                            tmp_data
      TYPE(dbcsr_distribution_obj)                       :: target_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_type)                                   :: replicated
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Matrix not initialized.")
      !IF(matrix%replication_type /= dbcsr_repl_none) &
      !   DBCSR_WARN("Replicating a non-distributed matrix makes no sense.")
      ! Translate the requested replication directions into the matrix
      ! replication-type flag.  Requesting neither direction is an error.
      IF (replicate_rows .AND. replicate_columns) THEN
         rep_type = dbcsr_repl_full
      ELSEIF (replicate_rows .AND. .NOT. replicate_columns) THEN
         rep_type = dbcsr_repl_row
      ELSEIF (replicate_columns .AND. .NOT. replicate_rows) THEN
         rep_type = dbcsr_repl_col
      ELSE
         rep_type = dbcsr_repl_none
         IF (.NOT. replicate_rows .AND. .NOT. replicate_columns) &
            DBCSR_ABORT("Some replication must be specified")
      END IF
      data_type = dbcsr_get_data_type(matrix)
      row_blk_size => array_data(matrix%row_blk_size)
      col_blk_size => array_data(matrix%col_blk_size)
      target_dist = matrix%dist
      row_dist => dbcsr_distribution_row_dist(target_dist)
      col_dist => dbcsr_distribution_col_dist(target_dist)
      mp_obj = dbcsr_distribution_mp(target_dist)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      numnodes = dbcsr_mp_numnodes(mp_obj)
      mynode = dbcsr_mp_mynode(mp_obj)
      myprow = dbcsr_mp_myprow(mp_obj)
      mypcol = dbcsr_mp_mypcol(mp_obj)
      ! Sanity checks: the distributions must not reference process
      ! rows/columns outside the process grid.
      IF (MAXVAL(row_dist) .GT. UBOUND(blacs2mpi, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (MAXVAL(row_dist) .NE. UBOUND(blacs2mpi, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (MAXVAL(col_dist) .GT. UBOUND(blacs2mpi, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (MAXVAL(col_dist) .NE. UBOUND(blacs2mpi, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF
      ! Define the number of nodes with which I will communicate. Also
      ! setup row and column communicators.
      had_subcomms = dbcsr_mp_has_subgroups(mp_obj)
      SELECT CASE (rep_type)
      CASE (dbcsr_repl_full)
         numnodes = dbcsr_mp_numnodes(mp_obj)
         mp_group = dbcsr_mp_group(mp_obj)
         mynode = dbcsr_mp_mynode(mp_obj)
      CASE (dbcsr_repl_row)
         numnodes = dbcsr_mp_npcols(mp_obj)
         CALL dbcsr_mp_grid_setup(mp_obj)
         mp_group = dbcsr_mp_my_row_group(mp_obj)
         mynode = dbcsr_mp_mypcol(mp_obj)
      CASE (dbcsr_repl_col)
         numnodes = dbcsr_mp_nprows(mp_obj)
         CALL dbcsr_mp_grid_setup(mp_obj)
         mp_group = dbcsr_mp_my_col_group(mp_obj)
         mynode = dbcsr_mp_myprow(mp_obj)
      CASE (dbcsr_repl_none)
         numnodes = 1
         mp_group = dbcsr_mp_group(mp_obj)
         mynode = 0
      END SELECT
      IF ((rep_type == dbcsr_repl_row .OR. rep_type == dbcsr_repl_col) .AND. .NOT. dbcsr_mp_has_subgroups(mp_obj)) &
         DBCSR_ABORT("Only full replication supported when subcommunicators are turned off.")
      !
      ! With a restricted source, only the given node contributes blocks;
      ! everybody else sends nothing (but still participates in the
      ! collective exchanges below).
      IF (PRESENT(restrict_source)) THEN
         rest_src = .TRUE.
         i_am_restricted = mynode .NE. restrict_source
      ELSE
         rest_src = .FALSE.
         i_am_restricted = .FALSE.
      END IF
      !
      ! Per-source counts (1 = number of blocks, 2 = data size) and
      ! displacements into the gathered data (rd_disp) and metadata
      ! (rm_disp) buffers.
      ALLOCATE (recv_count(2, 0:numnodes - 1))
      ALLOCATE (rd_disp(0:numnodes - 1))
      ALLOCATE (rm_disp(0:numnodes - 1))
      CALL dbcsr_create(replicated, name='Replicated '//TRIM(matrix%name), &
                        template=matrix, &
                        matrix_type=dbcsr_type_no_symmetry, &
                        replication_type=rep_type)
      !
      ! Count initial sizes for sending. Also, ensure that blocks are on their
      ! home processors.
      send_count(1:2) = 0
      CALL dbcsr_iterator_start(iter, matrix)
      IF (.NOT. i_am_restricted) THEN
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, &
                                           row_size=row_size, col_size=col_size, blk=blk)
            !tr = .FALSE.
            !CALL dbcsr_get_stored_coordinates (matrix, row, col, tr, dst_p)
            !IF(dst_p /= mynode) &
            !   DBCSR_ABORT("Matrix is not correctly distributed. Call dbcsr_redistribute.")
            nze = row_size*col_size
            send_count(1) = send_count(1) + 1
            send_count(2) = send_count(2) + nze
         END DO
         ! NOTE(review): the data size to send is taken as the referenced
         ! size of the whole data area, overriding the per-block sum
         ! accumulated above — presumably the data area is contiguous and
         ! fully referenced by the local blocks; confirm before changing.
         send_count(2) = dbcsr_data_get_size_referenced(matrix%data_area)
      END IF
      CALL dbcsr_iterator_stop(iter)
      ! Exchange how much data others have.
      CALL mp_allgather(send_count(1:2), recv_count, mp_group)
      CALL dbcsr_data_init(recv_data)
      nze = SUM(recv_count(2, :))
      nblks = SUM(recv_count(1, :))
      CALL dbcsr_data_new(recv_data, data_type=data_type, data_size=nze)
      ! send_data should have the correct size
      ! When there is data to send, send_data is made an alias (pointer
      ! view) into the matrix data area — no copy is made, so it must be
      ! cleared (not deallocated) after the exchange.
      CALL dbcsr_data_init(send_data)
      IF (send_count(2) .EQ. 0) THEN
         CALL dbcsr_data_new(send_data, data_type=data_type, data_size=0)
      ELSE
         CALL dbcsr_data_new(send_data, data_type=data_type)
         send_data = pointer_view(send_data, matrix%data_area, 1, send_count(2))
      END IF
      ALLOCATE (recv_meta(metalen*nblks))
      ALLOCATE (send_meta(metalen*send_count(1)))
      recv_meta(:) = 0
      ! Fill in the meta data structures and copy the data.
      ! Compute the 1-based displacement of each source process's
      ! contribution in the gathered metadata/data buffers.
      rd_disp = -1; rm_disp = -1
      rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numnodes - 1
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + recv_count(2, dst_p - 1)
      END DO
      ! NOTE(review): data_block is created here but does not appear to be
      ! used inside the metadata-packing loop below — possibly leftover;
      ! confirm before removing.
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, data_type=data_type)
      CALL dbcsr_iterator_start(iter, matrix)
      smp = 1
      IF (.NOT. i_am_restricted) THEN
         ! Pack per-block metadata: row, column, and in-area data pointer
         ! (blk_p; its sign is preserved downstream, presumably encoding
         ! block transposition).
         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                           transposed=tr, blk_p=blk_p)
            send_meta(smp + 0) = row
            send_meta(smp + 1) = col
            send_meta(smp + 2) = blk_p
            smp = smp + metalen
         END DO
      END IF
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(data_block)
      ! Exchange the data and metadata structures.
      CALL mp_allgather(send_meta, recv_meta, metalen*recv_count(1, :), &
                        rm_disp - 1, mp_group)
      CALL dbcsr_allgatherv(send_data, dbcsr_data_get_size(send_data), &
                            recv_data, recv_count(2, :), &
                            rd_disp - 1, mp_group)
      ! Release the send buffer. If it had a non-zero size then it was a
      ! pointer into the regular matrix and the data pointer should be
      ! cleared and not deallocated.
      IF (send_count(2) .NE. 0) THEN
         CALL dbcsr_data_clear_pointer(send_data)
      END IF
      CALL dbcsr_data_release(send_data)
      !
      ! Now fill in the data.
      CALL dbcsr_work_create(replicated, &
                             SUM(recv_count(1, :)), &
                             SUM(recv_count(2, :)), n=1, &
                             work_mutable=.FALSE.)
      ! Substitute the gathered data for the work matrix's data area;
      ! hold an extra reference because recv_data is released separately
      ! at the end of this routine.
      CALL dbcsr_data_hold(recv_data)
      CALL dbcsr_data_release(replicated%wms(1)%data_area)
      replicated%wms(1)%data_area = recv_data
      ! Rebuild the index: shift each source's block pointers by that
      ! source's displacement in the gathered data; the magnitude is
      ! shifted while the sign of blk_p is preserved.
      blk_ps = 1
      blks = 1
      DO src_p = 0, numnodes - 1
         nze = recv_count(2, src_p)
         !CALL dbcsr_data_set (replicated%wms(1)%data_area, blk_ps, nze,&
         !     recv_data, rd_disp(src_p))
         offset = rd_disp(src_p) - 1
         DO blk_l = 1, recv_count(1, src_p)
            IF (dbg) WRITE (*, *) "src_p, blk_l", src_p, blk_l
            stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1))
            stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1)
            stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2)
            replicated%wms(1)%row_i(blks) = stored_row
            replicated%wms(1)%col_i(blks) = stored_col
            replicated%wms(1)%blk_p(blks) = SIGN(ABS(stored_blk_p) + offset, &
                                                 stored_blk_p)
            nze = row_blk_size(stored_row) &
                  *col_blk_size(stored_col)
            ! Track the end of the used data region (blk_ps is one past
            ! the last used element).
            blk_ps = MAX(blk_ps, ABS(stored_blk_p) + nze + offset)
            blks = blks + 1
         END DO
      END DO
      CALL dbcsr_data_set_size_referenced(replicated%wms(1)%data_area, blk_ps - 1)
      !
      replicated%wms(1)%lastblk = blks - 1
      replicated%wms(1)%datasize = blk_ps - 1
      CALL dbcsr_finalize(replicated, reshuffle=.TRUE.)
      !
      ! Remove communicators if they were forcibly created.
      IF (had_subcomms .AND. &
          (rep_type .EQ. dbcsr_repl_row .OR. rep_type .EQ. dbcsr_repl_col)) THEN
         CALL dbcsr_mp_grid_remove(mp_obj)
      END IF
      DEALLOCATE (recv_count)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)
      CALL dbcsr_data_release(recv_data)
      DEALLOCATE (recv_meta)
      DEALLOCATE (send_meta)
      matrix%replication_type = replicated%replication_type
      ! Now replace the data and index
      ! Swap the data area and the index arrays between the input matrix
      ! and the temporary replicated matrix, so that releasing
      ! 'replicated' frees the old (pre-replication) storage.
      CALL dbcsr_switch_data_area(matrix, replicated%data_area, &
                                  previous_data_area=tmp_data)
      CALL dbcsr_switch_data_area(replicated, tmp_data)
      CALL dbcsr_data_release(tmp_data)
      tmp_index => matrix%index
      matrix%index => replicated%index
      replicated%index => tmp_index
      CALL dbcsr_repoint_index(matrix)
      matrix%nze = replicated%nze
      matrix%nblks = replicated%nblks
      CALL dbcsr_release(replicated)
      CALL dbcsr_verify_matrix(matrix)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_replicate

   SUBROUTINE dbcsr_complete_redistribute(matrix, redist, keep_sparsity, summation)
      !! Fully redistributes a DBCSR matrix.
      !! The new distribution may be arbitrary as long as the total
      !! number full rows and columns matches that of the existing
      !! matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! matrix to redistribute
      TYPE(dbcsr_type), INTENT(INOUT)                    :: redist
         !! redistributed matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: keep_sparsity, summation
         !! retains the sparsity of the redist matrix
         !! sum blocks with identical row and col from different processes

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_complete_redistribute'
      ! Integers of metadata sent per run-length-encoded (RLE) piece:
      ! new block row, new block column, row/column offsets within the
      ! new block, run extents (rows, columns), and the piece's offset
      ! in the data payload (see the send_meta packing below).
      INTEGER, PARAMETER                                 :: metalen = 7
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER :: blk, blk_col_new, blk_ps, blk_row_new, blks, cnt_fnd, cnt_new, cnt_skip, col, &
                 col_int, col_offset_new, col_offset_old, col_rle, col_size, col_size_new, data_offset_l, &
                 data_type, dst_p, handle, i, meta_l, numnodes, nze_rle, row, row_int, &
                 row_offset_new, row_offset_old, row_rle, row_size, row_size_new, src_p, stored_col_new, &
                 stored_row_new
      INTEGER, ALLOCATABLE, DIMENSION(:) :: col_end_new, col_end_old, col_start_new, &
                                            col_start_old, rd_disp, recv_meta, rm_disp, row_end_new, row_end_old, row_start_new, &
                                            row_start_old, sd_disp, sdp, send_meta, sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :) :: col_reblocks, n_col_reblocks, n_row_reblocks, &
                                               recv_count, row_reblocks, send_count, total_recv_count, total_send_count
      INTEGER, DIMENSION(:), POINTER                     :: col_blk_size_new, col_blk_size_old, &
                                                            col_dist_new, row_blk_size_new, &
                                                            row_blk_size_old, row_dist_new
      INTEGER, DIMENSION(:, :), POINTER                  :: pgrid
      LOGICAL                                            :: found, my_keep_sparsity, my_summation, &
                                                            sym, tr, valid_block
      REAL(kind=dp)                                      :: cs1, cs2
      TYPE(dbcsr_data_obj)                               :: buff_data, data_block, recv_data, &
                                                            send_data
      TYPE(dbcsr_distribution_obj)                       :: dist_new
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj_new
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Input not valid.")
      IF (matrix%replication_type .NE. dbcsr_repl_none) &
         DBCSR_WARN("Can not redistribute replicated matrix.")
      IF (dbcsr_has_symmetry(matrix) .AND. .NOT. dbcsr_has_symmetry(redist)) &
         DBCSR_ABORT("Can not redistribute a symmetric matrix into a non-symmetric one")
      !
      my_keep_sparsity = .FALSE.
      IF (PRESENT(keep_sparsity)) my_keep_sparsity = keep_sparsity
      !
      my_summation = .FALSE.
      IF (PRESENT(summation)) my_summation = summation

      ! zero blocks that might be present in the target (redist) but not in the source (matrix)
      CALL dbcsr_set(redist, 0.0_dp)

      sym = dbcsr_has_symmetry(redist)
      data_type = matrix%data_type
      ! Get row and column start and end positions
      ! Old matrix
      row_blk_size_old => array_data(matrix%row_blk_size)
      col_blk_size_old => array_data(matrix%col_blk_size)
      ALLOCATE (row_start_old(dbcsr_nblkrows_total(matrix)), &
                row_end_old(dbcsr_nblkrows_total(matrix)), &
                col_start_old(dbcsr_nblkcols_total(matrix)), &
                col_end_old(dbcsr_nblkcols_total(matrix)))
      CALL convert_sizes_to_offsets(row_blk_size_old, &
                                    row_start_old, row_end_old)
      CALL convert_sizes_to_offsets(col_blk_size_old, &
                                    col_start_old, col_end_old)
      ! New matrix
      dist_new = dbcsr_distribution(redist)
      row_blk_size_new => array_data(redist%row_blk_size)
      col_blk_size_new => array_data(redist%col_blk_size)
      ALLOCATE (row_start_new(dbcsr_nblkrows_total(redist)), &
                row_end_new(dbcsr_nblkrows_total(redist)), &
                col_start_new(dbcsr_nblkcols_total(redist)), &
                col_end_new(dbcsr_nblkcols_total(redist)))
      CALL convert_sizes_to_offsets(row_blk_size_new, &
                                    row_start_new, row_end_new)
      CALL convert_sizes_to_offsets(col_blk_size_new, &
                                    col_start_new, col_end_new)
      row_dist_new => dbcsr_distribution_row_dist(dist_new)
      col_dist_new => dbcsr_distribution_col_dist(dist_new)
      ! Create mappings
      ! The reblocking tables map every old block row (resp. column) to
      ! the list of new blocks it overlaps, with run extents and offsets
      ! in both the old and new blockings.
      i = dbcsr_nfullrows_total(redist)
      ALLOCATE (row_reblocks(4, i))
      ALLOCATE (n_row_reblocks(2, dbcsr_nblkrows_total(matrix)))
      CALL dbcsr_reblocking_targets(row_reblocks, i, n_row_reblocks, &
                                    row_blk_size_old, row_blk_size_new)
      i = dbcsr_nfullcols_total(redist)
      ALLOCATE (col_reblocks(4, i))
      ALLOCATE (n_col_reblocks(2, dbcsr_nblkcols_total(matrix)))
      CALL dbcsr_reblocking_targets(col_reblocks, i, n_col_reblocks, &
                                    col_blk_size_old, col_blk_size_new)
      !
      mp_obj_new = dbcsr_distribution_mp(dist_new)
      pgrid => dbcsr_mp_pgrid(mp_obj_new)
      numnodes = dbcsr_mp_numnodes(mp_obj_new)
      mp_group = dbcsr_mp_group(mp_obj_new)
      !
      ! Sanity checks: the new distribution must not reference process
      ! rows/columns outside the process grid.
      IF (MAXVAL(row_dist_new) > UBOUND(pgrid, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (MAXVAL(row_dist_new) .NE. UBOUND(pgrid, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (MAXVAL(col_dist_new) > UBOUND(pgrid, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (MAXVAL(col_dist_new) .NE. UBOUND(pgrid, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF
      ! Per-destination counts (1 = RLE pieces, 2 = data elements) and
      ! the matching displacement/cursor arrays for send and receive.
      ALLOCATE (send_count(2, 0:numnodes - 1))
      ALLOCATE (recv_count(2, 0:numnodes - 1))
      ALLOCATE (total_send_count(2, 0:numnodes - 1))
      ALLOCATE (total_recv_count(2, 0:numnodes - 1))
      ALLOCATE (sdp(0:numnodes - 1))
      ALLOCATE (sd_disp(0:numnodes - 1))
      ALLOCATE (smp(0:numnodes - 1))
      ALLOCATE (sm_disp(0:numnodes - 1))
      ALLOCATE (rd_disp(0:numnodes - 1))
      ALLOCATE (rm_disp(0:numnodes - 1))
      IF (dbg) THEN
         cs1 = dbcsr_checksum(matrix)
      END IF
      !cs1 = dbcsr_checksum (matrix)
      !call dbcsr_print(matrix)
      !
      !
      ! Count initial sizes for sending.
      !
      ! We go through every element of every local block and determine
      ! to which processor it must be sent. It could be more efficient,
      ! but at least the index data are run-length encoded.
      send_count(:, :) = 0
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk)
         DO col_int = n_col_reblocks(1, col), &
            n_col_reblocks(1, col) + n_col_reblocks(2, col) - 1
            blk_col_new = col_reblocks(1, col_int)
            DO row_int = n_row_reblocks(1, row), &
               n_row_reblocks(1, row) + n_row_reblocks(2, row) - 1
               blk_row_new = row_reblocks(1, row_int)
               ! For symmetric targets only the upper triangle
               ! (blk_col_new >= blk_row_new) is stored.
               IF (.NOT. sym .OR. blk_col_new .GE. blk_row_new) THEN
                  tr = .FALSE.
                  CALL dbcsr_get_stored_coordinates(redist, &
                                                    blk_row_new, blk_col_new, dst_p)
                  send_count(1, dst_p) = send_count(1, dst_p) + 1
                  send_count(2, dst_p) = send_count(2, dst_p) + &
                                         col_reblocks(2, col_int)*row_reblocks(2, row_int)
               END IF
            END DO
         END DO
      END DO
      CALL dbcsr_iterator_stop(iter)
      !
      !
      CALL mp_alltoall(send_count, recv_count, 2, mp_group)
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data)
      CALL dbcsr_data_new(recv_data, data_type, SUM(recv_count(2, :)))
      ALLOCATE (recv_meta(metalen*SUM(recv_count(1, :))))
      CALL dbcsr_data_init(send_data)
      CALL dbcsr_data_new(send_data, data_type, SUM(send_count(2, :)))
      ALLOCATE (send_meta(metalen*SUM(send_count(1, :))))
      ! Fill in the meta data structures and copy the data.
      DO dst_p = 0, numnodes - 1
         total_send_count(1, dst_p) = send_count(1, dst_p)
         total_send_count(2, dst_p) = send_count(2, dst_p)
         total_recv_count(1, dst_p) = recv_count(1, dst_p)
         total_recv_count(2, dst_p) = recv_count(2, dst_p)
      END DO
      ! Compute 1-based displacements of each process's region in the
      ! send (sd/sm) and receive (rd/rm) data and metadata buffers.
      sd_disp = -1; sm_disp = -1
      rd_disp = -1; rm_disp = -1
      sd_disp(0) = 1; sm_disp(0) = 1
      rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numnodes - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(2, dst_p - 1)
      END DO
      sdp(:) = sd_disp     ! sdp points to the the next place to store
      ! data. It is postincremented.
      smp(:) = sm_disp - metalen  ! But smp points to the "working" data, not
      ! the next. It is pre-incremented, so we must
      ! first rewind it.
      !
      ! Second pass over the local blocks: pack each RLE piece's metadata
      ! and copy its data into the per-destination send regions.
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, data_type)
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_block, tr, blk, &
                                        row_size=row_size, col_size=col_size)
         !IF (tr) WRITE(*,*)"block at",row,col," is transposed"
         DO col_int = n_col_reblocks(1, col), &
            n_col_reblocks(1, col) + n_col_reblocks(2, col) - 1
            blk_col_new = col_reblocks(1, col_int)
            DO row_int = n_row_reblocks(1, row), &
               n_row_reblocks(1, row) + n_row_reblocks(2, row) - 1
               blk_row_new = row_reblocks(1, row_int)
               loc_ok: IF (.NOT. sym .OR. blk_col_new .GE. blk_row_new) THEN
                  IF (dbg) &
                     WRITE (*, *) 'using block', blk_row_new, 'x', blk_col_new
                  ! Start a new RLE run
                  tr = .FALSE.
                  CALL dbcsr_get_stored_coordinates(redist, &
                                                    blk_row_new, blk_col_new, dst_p)
                  row_offset_old = row_reblocks(3, row_int)
                  col_offset_old = col_reblocks(3, col_int)
                  row_offset_new = row_reblocks(4, row_int)
                  col_offset_new = col_reblocks(4, col_int)
                  row_rle = row_reblocks(2, row_int)
                  col_rle = col_reblocks(2, col_int)
                  smp(dst_p) = smp(dst_p) + metalen
                  send_meta(smp(dst_p)) = blk_row_new   ! new blocked row
                  send_meta(smp(dst_p) + 1) = blk_col_new ! new blocked column
                  send_meta(smp(dst_p) + 2) = row_offset_new  ! row in new block
                  send_meta(smp(dst_p) + 3) = col_offset_new  ! col in new block
                  send_meta(smp(dst_p) + 4) = row_rle ! RLE rows
                  send_meta(smp(dst_p) + 5) = col_rle ! RLE columns
                  send_meta(smp(dst_p) + 6) = sdp(dst_p) - sd_disp(dst_p) ! Offset in data
                  nze_rle = row_rle*col_rle
                  ! Copy current block into the send buffer
                  CALL dbcsr_block_partial_copy( &
                     send_data, dst_offset=sdp(dst_p) - 1, &
                     dst_rs=row_rle, dst_cs=col_rle, dst_tr=.FALSE., &
                     dst_r_lb=1, dst_c_lb=1, &
                     src=data_block, &
                     src_rs=row_size, src_cs=col_size, src_tr=tr, &
                     src_r_lb=row_offset_old, src_c_lb=col_offset_old, &
                     nrow=row_rle, ncol=col_rle)
                  sdp(dst_p) = sdp(dst_p) + nze_rle
               END IF loc_ok
            END DO ! row_int
         END DO ! col_int
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(data_block)

      ! Exchange the data and metadata structures.
      !
      ! Type-dispatched hybrid all-to-all of the packed block data.
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL hybrid_alltoall_s1( &
            send_data%d%r_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_real_8)
         !CALL mp_alltoall(&
         !     send_data%d%r_dp(:), total_send_count(2,:), sd_disp(:)-1,&
         !     recv_data%d%r_dp(:), total_recv_count(2,:), rd_disp(:)-1,&
         !     mp_group)
         CALL hybrid_alltoall_d1( &
            send_data%d%r_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_4)
         CALL hybrid_alltoall_c1( &
            send_data%d%c_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_8)
         CALL hybrid_alltoall_z1( &
            send_data%d%c_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE default
         DBCSR_ABORT("Invalid matrix type")
      END SELECT
      CALL hybrid_alltoall_i1(send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
                              recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, mp_obj_new)
      !
      ! Now fill in the data.
      ! Unpack: for each received RLE piece, copy its data into the
      ! matching block of the target matrix (creating the block when it
      ! is absent and sparsity need not be preserved).
      CALL dbcsr_work_create(redist, &
                             nblks_guess=SUM(recv_count(1, :)), &
                             sizedata_guess=SUM(recv_count(2, :)), work_mutable=.TRUE.)
      ! buff_data is a scratch area large enough for the biggest new
      ! block; data_block is a 2-D view used to address target blocks.
      CALL dbcsr_data_init(buff_data)
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(buff_data, dbcsr_type_1d_to_2d(data_type), &
                          redist%max_rbs, redist%max_cbs)
      CALL dbcsr_data_new(data_block, dbcsr_type_1d_to_2d(data_type))

      !blk_p = 1
      !blk = 1
      ! Debug counters: blocks found in the target, newly created ones,
      ! and skipped pieces.
      blk_ps = 0
      blks = 0
      cnt_fnd = 0; cnt_new = 0; cnt_skip = 0
      DO src_p = 0, numnodes - 1
         data_offset_l = rd_disp(src_p)
         DO meta_l = 1, recv_count(1, src_p)
            ! Decode the metalen=7 metadata record for this RLE piece.
            stored_row_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1))
            stored_col_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 1)
            row_offset_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 2)
            col_offset_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 3)
            row_rle = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 4)
            col_rle = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 5)
            data_offset_l = rd_disp(src_p) &
                            + recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 6)

            CALL dbcsr_data_clear_pointer(data_block)
            CALL dbcsr_get_block_p(redist, stored_row_new, stored_col_new, &
                                   data_block, tr, found)
            valid_block = found

            IF (found) cnt_fnd = cnt_fnd + 1
            IF (.NOT. found .AND. .NOT. my_keep_sparsity) THEN
               ! We have to set up a buffer block
               ! Point data_block at a zeroed region of buff_data shaped
               ! like the full new block.
               CALL dbcsr_data_set_pointer(data_block, &
                                           rsize=row_blk_size_new(stored_row_new), &
                                           csize=col_blk_size_new(stored_col_new), &
                                           pointee=buff_data)
               CALL dbcsr_data_clear(data_block)
               !r2_dp => r2_dp_buff(1:row_blk_size_new (stored_row_new),&
               !     1:col_blk_size_new (stored_col_new))
               !r2_dp(:,:) = 0.0_dp
               tr = .FALSE.
               blks = blks + 1
               blk_ps = blk_ps + row_blk_size_new(stored_row_new)* &
                        col_blk_size_new(stored_col_new)
               valid_block = .TRUE.
               cnt_new = cnt_new + 1
            END IF
            nze_rle = row_rle*col_rle

            ! Copy this RLE piece from the receive buffer into its place
            ! within the (existing or freshly staged) target block.
            IF (valid_block) THEN
               row_size_new = row_blk_size_new(stored_row_new)
               col_size_new = col_blk_size_new(stored_col_new)
               CALL dbcsr_block_partial_copy( &
                  dst=data_block, dst_tr=tr, &
                  dst_rs=row_size_new, dst_cs=col_size_new, &
                  dst_r_lb=row_offset_new, dst_c_lb=col_offset_new, &
                  src=recv_data, src_offset=data_offset_l - 1, &
                  src_rs=row_rle, src_cs=col_rle, src_tr=.FALSE., &
                  src_r_lb=1, src_c_lb=1, &
                  nrow=row_rle, ncol=col_rle)
            ELSE
               cnt_skip = cnt_skip + 1
            END IF

            data_offset_l = data_offset_l + nze_rle
            ! Freshly staged blocks (and, under summation, existing ones)
            ! are inserted into the work area; blocks found in place were
            ! updated through data_block directly.
            IF ((.NOT. found .OR. my_summation) .AND. valid_block) THEN
               IF (dbg) WRITE (*, *) routineN//" Adding new block at", &
                  stored_row_new, stored_col_new
               CALL dbcsr_put_block(redist, stored_row_new, stored_col_new, &
                                    data_block, transposed=tr, summation=my_summation)
               !DEALLOCATE (r2_dp)
            ELSE
               IF (.NOT. my_keep_sparsity .AND. dbg) &
                  WRITE (*, *) routineN//" Reusing block at", &
                  stored_row_new, stored_col_new
            END IF
         END DO
      END DO

      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(buff_data)
      CALL dbcsr_data_release(data_block)
      !
      IF (dbg) THEN
         WRITE (*, *) routineN//" Declared blocks=", redist%wms(1)%lastblk, &
            "actual=", blks
         WRITE (*, *) routineN//" Declared data size=", redist%wms(1)%datasize, &
            "actual=", blk_ps
      END IF

      CALL dbcsr_finalize(redist)

      DEALLOCATE (send_count)
      DEALLOCATE (recv_count)
      DEALLOCATE (sdp); DEALLOCATE (sd_disp)
      DEALLOCATE (smp); DEALLOCATE (sm_disp)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)

      CALL dbcsr_data_release(recv_data)
      CALL dbcsr_data_release(send_data)

      DEALLOCATE (recv_meta)
      DEALLOCATE (send_meta)

      !if (dbg) call dbcsr_print(redist)
      IF (dbg) THEN
         cs2 = dbcsr_checksum(redist)
         WRITE (*, *) routineN//" Checksums=", cs1, cs2, cs1 - cs2
      END IF
      !IF(cs1-cs2 > 0.00001) DBCSR_ABORT("Mangled data!")
      CALL timestop(handle)
   END SUBROUTINE dbcsr_complete_redistribute

   SUBROUTINE dbcsr_redistribute(matrix, redist)
      !! Redistributes a DBCSR matrix.
      !! The new distribution should have compatible row and column blocks.
      !!
      !! Algorithm (three phases):
      !!  1. Count pass: iterate the local blocks of "matrix", count blocks and
      !!     elements destined for each process of the new distribution, and
      !!     exchange the counts with mp_alltoall.
      !!  2. Pack pass: copy each block's data into its destination's region of
      !!     one contiguous send buffer and record (row, col) metadata pairs; a
      !!     negative row index encodes a transposed block.  Data and metadata
      !!     are exchanged with the hybrid_alltoall_* routines.
      !!  3. Unpack pass: walk the received metadata, alias the corresponding
      !!     slice of the received data buffer via pointer_view, and put the
      !!     blocks into a non-mutable work matrix, which is then finalized.

      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
         !! matrix to redistribute
      TYPE(dbcsr_type), INTENT(INOUT)                    :: redist
         !! redistributed matrix, which should already be created

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_redistribute'
      ! Number of integers of metadata stored per block (row index, col index).
      INTEGER, PARAMETER                                 :: metalen = 2
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER :: blk, blk_ps, blks, col, col_size, data_type, dst_p, handle, meta_l, &
                 numnodes, nze, row, row_size, src_p, stored_col_new, stored_row_new
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: rd_disp, recv_meta, rm_disp, sd_disp, &
                                                            sdp, send_meta, sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: recv_count, send_count, &
                                                            total_recv_count, total_send_count
      INTEGER, DIMENSION(:), POINTER                     :: col_blk_size_new, col_dist_new, &
                                                            row_blk_size_new, row_dist_new
      INTEGER, DIMENSION(:, :), POINTER                  :: pgrid
      LOGICAL                                            :: sym_tr, tr
      TYPE(dbcsr_data_obj)                               :: data_block, recv_data, send_data
      TYPE(dbcsr_distribution_obj)                       :: dist_new
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_mp_obj)                                 :: mp_obj_new
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      !call dbcsr_print_dist (matrix%dist)
      !call dbcsr_print_dist (redist%dist)
      IF (.NOT. dbcsr_valid_index(matrix)) &
         DBCSR_ABORT("Input not valid.")
      IF (matrix%replication_type .NE. dbcsr_repl_none) &
         DBCSR_WARN("Can not redistribute replicated matrix.")
      data_type = matrix%data_type
      ! Get row and column start and end positions
      ! Old matrix
      ! New matrix
      dist_new = dbcsr_distribution(redist)
      row_blk_size_new => array_data(redist%row_blk_size)
      col_blk_size_new => array_data(redist%col_blk_size)
      row_dist_new => dbcsr_distribution_row_dist(dist_new)
      col_dist_new => dbcsr_distribution_col_dist(dist_new)
      !
      mp_obj_new = dbcsr_distribution_mp(dist_new)
      pgrid => dbcsr_mp_pgrid(mp_obj_new)
      numnodes = dbcsr_mp_numnodes(mp_obj_new)
      mp_group = dbcsr_mp_group(mp_obj_new)
      !
      ! Sanity checks: the target distribution may only reference process
      ! rows/columns that exist in the new process grid.
      IF (MAXVAL(row_dist_new) .GT. UBOUND(pgrid, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (MAXVAL(row_dist_new) .NE. UBOUND(pgrid, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (MAXVAL(col_dist_new) .GT. UBOUND(pgrid, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (MAXVAL(col_dist_new) .NE. UBOUND(pgrid, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF
      ! Per-process counters/displacements; first index 1 counts blocks,
      ! first index 2 counts data elements.  Process ranks run 0..numnodes-1.
      ALLOCATE (send_count(2, 0:numnodes - 1))
      ALLOCATE (recv_count(2, 0:numnodes - 1))
      ALLOCATE (total_send_count(2, 0:numnodes - 1))
      ALLOCATE (total_recv_count(2, 0:numnodes - 1))
      ALLOCATE (sdp(0:numnodes - 1))
      ALLOCATE (sd_disp(0:numnodes - 1))
      ALLOCATE (smp(0:numnodes - 1))
      ALLOCATE (sm_disp(0:numnodes - 1))
      ALLOCATE (rd_disp(0:numnodes - 1))
      ALLOCATE (rm_disp(0:numnodes - 1))
      ! Count initial sizes for sending.
      !
      send_count(:, :) = 0
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, tr, &
                                        row_size=row_size, col_size=col_size)
         ! NOTE(review): sym_tr is assigned here and below but never read in
         ! this routine.
         sym_tr = .FALSE.
         CALL dbcsr_get_stored_coordinates(redist, &
                                           row, col, dst_p)
         nze = row_size*col_size
         send_count(1, dst_p) = send_count(1, dst_p) + 1
         send_count(2, dst_p) = send_count(2, dst_p) + nze
      END DO
      CALL dbcsr_iterator_stop(iter)
      ! Tell every process how many blocks/elements to expect from us.
      CALL mp_alltoall(send_count, recv_count, 2, mp_group)
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data)
      CALL dbcsr_data_new(recv_data, data_type, SUM(recv_count(2, :)))
      ALLOCATE (recv_meta(metalen*SUM(recv_count(1, :))))
      CALL dbcsr_data_init(send_data)
      CALL dbcsr_data_new(send_data, data_type, SUM(send_count(2, :)))
      ALLOCATE (send_meta(metalen*SUM(send_count(1, :))))
      ! Fill in the meta data structures and copy the data.
      DO dst_p = 0, numnodes - 1
         total_send_count(1, dst_p) = send_count(1, dst_p)
         total_send_count(2, dst_p) = send_count(2, dst_p)
         total_recv_count(1, dst_p) = recv_count(1, dst_p)
         total_recv_count(2, dst_p) = recv_count(2, dst_p)
      END DO
      ! Exclusive prefix sums (1-based) of the per-process counts give the
      ! starting offsets of each process's region in the packed buffers.
      sd_disp = -1; sm_disp = -1
      rd_disp = -1; rm_disp = -1
      sd_disp(0) = 1; sm_disp(0) = 1
      rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numnodes - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(2, dst_p - 1)
      END DO
      sdp(:) = sd_disp     ! sdp points to the next place to store
      ! data. It is postincremented.
      smp(:) = sm_disp - metalen  ! But smp points to the "working" data, not
      ! the next. It is pre-incremented, so we must
      ! first rewind it.
      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, data_type)
      ! Second pass: pack block data and metadata into the send buffers.
      CALL dbcsr_iterator_start(iter, matrix)
      dst_p = -1
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, data_block, tr, blk)
         !IF (tr) WRITE(*,*)"block at",row,col," is transposed"
         sym_tr = .FALSE.
         CALL dbcsr_get_stored_coordinates(redist, &
                                           row, col, dst_p)
         smp(dst_p) = smp(dst_p) + metalen
         ! A negative row index in the metadata marks a transposed block.
         IF (tr) THEN
            send_meta(smp(dst_p)) = -row
         ELSE
            send_meta(smp(dst_p)) = row
         END IF
         send_meta(smp(dst_p) + 1) = col ! new blocked column
         nze = dbcsr_data_get_size(data_block)
         CALL dbcsr_data_set(send_data, lb=sdp(dst_p), data_size=nze, &
                             src=data_block, source_lb=1)
         !send_data(sdp(dst_p):sdp(dst_p)+SIZE(r_dp)-1) &
         !     = r_dp(:)
         sdp(dst_p) = sdp(dst_p) + nze
      END DO
      CALL dbcsr_iterator_stop(iter)
      CALL dbcsr_data_clear_pointer(data_block)
      ! Exchange the data and metadata structures.
      ! Displacements are converted from 1-based to 0-based for the
      ! alltoall routines.
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL hybrid_alltoall_s1( &
            send_data%d%r_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_real_8)
         !CALL mp_alltoall(&
         !     send_data%d%r_dp(:), total_send_count(2,:), sd_disp(:)-1,&
         !     recv_data%d%r_dp(:), total_recv_count(2,:), rd_disp(:)-1,&
         !     mp_group)
         CALL hybrid_alltoall_d1( &
            send_data%d%r_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_4)
         CALL hybrid_alltoall_c1( &
            send_data%d%c_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      CASE (dbcsr_type_complex_8)
         CALL hybrid_alltoall_z1( &
            send_data%d%c_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_obj_new)
      END SELECT
      !CALL mp_alltoall(send_data(:), total_send_count(2,:), sd_disp(:)-1,&
      !     recv_data(:), total_recv_count(2,:), rd_disp(:)-1, mp_group)
      CALL hybrid_alltoall_i1(send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
                              recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, mp_obj_new)
      ! Now fill in the data.
      CALL dbcsr_work_create(redist, &
                             SUM(recv_count(1, :)), &
                             SUM(recv_count(2, :)), work_mutable=.FALSE., n=1)
      !
      ! Third pass: unpack the received blocks.  blk_ps walks linearly through
      ! the received data; the received metadata is consumed per source
      ! process in the same order it was packed.
      blk_ps = 1
      blks = 0
      DO src_p = 0, numnodes - 1
         !data_offset_l = rd_disp(src_p)
         DO meta_l = 1, recv_count(1, src_p)
            row = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1))
            tr = row .LT. 0   ! negative row marks a transposed block
            stored_row_new = ABS(row)
            stored_col_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 1)
            nze = row_blk_size_new(stored_row_new)*col_blk_size_new(stored_col_new)
            !r_dp => recv_data(blk_ps:blk_ps+nze-1)
            !CALL dbcsr_put_block(redist, stored_row_new, stored_col_new, r_dp, tr)
            !### this should be changed to be like the make images (i.e., copy data in finalize, not here & now)
            ! data_block aliases a slice of recv_data; no copy is made here.
            data_block = pointer_view(data_block, recv_data, blk_ps, nze)
            CALL dbcsr_put_block(redist, stored_row_new, stored_col_new, data_block, transposed=tr)
            blk_ps = blk_ps + nze
            blks = blks + 1
         END DO
      END DO
      ! Detach the alias before releasing so recv_data is not freed twice.
      CALL dbcsr_data_clear_pointer(data_block)
      CALL dbcsr_data_release(data_block)
      !
      IF (dbg) THEN
         WRITE (*, *) routineN//" Declared blocks=", redist%wms(1)%lastblk, &
            "actual=", blks
         WRITE (*, *) routineN//" Declared data size=", redist%wms(1)%datasize, &
            "actual=", blk_ps
      END IF
      CALL dbcsr_finalize(redist)
      CALL dbcsr_data_release(recv_data)
      CALL dbcsr_data_release(send_data)
      DEALLOCATE (send_count)
      DEALLOCATE (recv_count)
      DEALLOCATE (sdp); DEALLOCATE (sd_disp)
      DEALLOCATE (smp); DEALLOCATE (sm_disp)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)
      DEALLOCATE (recv_meta)
      DEALLOCATE (send_meta)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_redistribute

   SUBROUTINE dbcsr_datablock_redistribute(dblk, row_p, col_i, blk_p, &
                                           proc_nblks, proc_darea_sizes, new_matrix)
      !! Redistributes data blocks of a DBCSR matrix read from a file.
      !! This routine should be used with dbcsr_binary_read in the module
      !! dbcsr_io.F
      !!
      !! The current node may have read the CSR index/data of several writer
      !! nodes concatenated (one "job" per original writer).  extra_nblks and
      !! extra_darea_sizes hold exclusive prefix sums of the per-writer block
      !! counts and data-area sizes, so that the concatenated col_i/blk_p
      !! arrays and the flat data area can be addressed per job.  The blocks
      !! are then counted, packed, exchanged with hybrid_alltoall_*, and
      !! inserted into new_matrix following the same pack/exchange/unpack
      !! scheme as dbcsr_redistribute.

      TYPE(dbcsr_data_obj), INTENT(IN)         :: dblk
         !! data blocks of the DBCSR matrix that the current node possesses after reading the data file
      INTEGER, DIMENSION(:), INTENT(IN), &
         POINTER                                :: row_p, col_i, blk_p, &
                                                   proc_nblks, proc_darea_sizes
         !! row_p of the DBCSR matrix that the current node possesses after reading the data file
         !! col_i of the DBCSR matrix that the current node possesses after reading the data file
         !! blk_p of the DBCSR matrix that the current node possesses after reading the data file
         !! 1D array holding nblks of those nodes of the mp environment, that created the file, whose contents have been read by the
         !! current node of the present mp environment
         !! 1D array holding data_area_size of those nodes of the mp environment, that created the file, whose contents have been
         !! read by the current node of the present mp environment
      TYPE(dbcsr_type), INTENT(INOUT)           :: new_matrix
         !! redistributed matrix

      CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_datablock_redistribute'
      ! Number of integers of metadata stored per block (row index, col index).
      INTEGER, PARAMETER                       :: metalen = 2

      COMPLEX(kind=dp), DIMENSION(:), POINTER, CONTIGUOUS :: c_dp
      COMPLEX(kind=sp), DIMENSION(:), POINTER, CONTIGUOUS :: c_sp
      INTEGER :: bcol, blk, blk_ps, blk_size, blks, brow, col_size, data_type, &
                 dst_p, handle, i, ind, job_count, meta_l, &
                 nblkrows_total, numnodes, row_size, src_p, stored_col_new, &
                 stored_row_new
      INTEGER(kind=int_8)                      :: actual_blk, blkp, end_ind, &
                                                  start_ind
      INTEGER(kind=int_8), ALLOCATABLE, &
         DIMENSION(:)                           :: extra_darea_size, extra_nblks
      INTEGER, ALLOCATABLE, DIMENSION(:)       :: rd_disp, recv_meta, rm_disp, &
                                                  sd_disp, sdp, send_meta, &
                                                  sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :)    :: recv_count, send_count, &
                                                  total_recv_count, &
                                                  total_send_count
      INTEGER, DIMENSION(:), POINTER           :: col_blk_size, row_blk_size
      LOGICAL                                  :: sym_tr, tr
      REAL(kind=dp), DIMENSION(:), POINTER, CONTIGUOUS :: r_dp
      REAL(kind=sp), DIMENSION(:), POINTER, CONTIGUOUS :: r_sp
      TYPE(dbcsr_data_obj)                     :: data_block, recv_data, &
                                                  send_data
      TYPE(dbcsr_distribution_obj)             :: dist
      TYPE(dbcsr_mp_obj)                       :: mp_env
      TYPE(mp_comm_type)                                 :: mp_group

      CALL timeset(routineN, handle)

      dist = dbcsr_distribution(new_matrix)
      mp_env = dbcsr_distribution_mp(dist)
      numnodes = dbcsr_mp_numnodes(mp_env)
      mp_group = dbcsr_mp_group(mp_env)

      data_type = dbcsr_get_data_type(new_matrix)
      nblkrows_total = dbcsr_nblkrows_total(new_matrix)
      row_blk_size => array_data(new_matrix%row_blk_size)
      col_blk_size => array_data(new_matrix%col_blk_size)

      ! Per-process counters/displacements; first index 1 counts blocks,
      ! first index 2 counts data elements.  Process ranks run 0..numnodes-1.
      ALLOCATE (send_count(2, 0:numnodes - 1))
      ALLOCATE (recv_count(2, 0:numnodes - 1))
      ALLOCATE (total_send_count(2, 0:numnodes - 1))
      ALLOCATE (total_recv_count(2, 0:numnodes - 1))
      ALLOCATE (sdp(0:numnodes - 1))
      ALLOCATE (sd_disp(0:numnodes - 1))
      ALLOCATE (smp(0:numnodes - 1))
      ALLOCATE (sm_disp(0:numnodes - 1))
      ALLOCATE (rd_disp(0:numnodes - 1))
      ALLOCATE (rm_disp(0:numnodes - 1))
      send_count(:, :) = 0
      dst_p = -1

      ! Number of original writer nodes whose data this node has read.
      job_count = COUNT(proc_nblks .NE. 0)

      ! Exclusive prefix sums: offsets of each job's blocks in col_i/blk_p
      ! and of each job's data in the flat data area.
      ALLOCATE (extra_nblks(job_count))
      ALLOCATE (extra_darea_size(job_count))
      IF (job_count > 0) THEN
         CALL cumsum_l(INT((/0, proc_nblks(1:job_count - 1)/), kind=int_8), extra_nblks)
         CALL cumsum_l(INT((/0, proc_darea_sizes(1:job_count - 1)/), kind=int_8), extra_darea_size)
      END IF

      ! First pass: count blocks/elements destined for each process.  ind
      ! walks all block rows of all jobs; i is the current job index and
      ! brow the block row within that job.
      i = 0
      DO ind = 1, job_count*nblkrows_total
         brow = MOD(ind - 1, nblkrows_total) + 1
         IF (brow .EQ. 1) i = i + 1
         row_size = row_blk_size(brow)
         ! row_p is concatenated per job with one extra leading entry each,
         ! hence the "ind + i - 1" addressing.
         DO blk = row_p(ind + i - 1) + 1, row_p(ind + i)
            actual_blk = INT(blk, kind=int_8) + extra_nblks(i)
            bcol = col_i(actual_blk)
            col_size = col_blk_size(bcol)
            blk_size = row_size*col_size
            ! NOTE(review): sym_tr is assigned here and below but never read
            ! in this routine.
            sym_tr = .FALSE.
            CALL dbcsr_get_stored_coordinates(new_matrix, brow, bcol, dst_p)
            send_count(1, dst_p) = send_count(1, dst_p) + 1
            send_count(2, dst_p) = send_count(2, dst_p) + blk_size
         END DO
      END DO
      ! Exchange counts and allocate the packed send/receive buffers.
      CALL mp_alltoall(send_count, recv_count, 2, mp_group)
      CALL dbcsr_data_init(recv_data)
      CALL dbcsr_data_new(recv_data, data_type, SUM(recv_count(2, :)))
      ALLOCATE (recv_meta(metalen*SUM(recv_count(1, :))))
      CALL dbcsr_data_init(send_data)
      CALL dbcsr_data_new(send_data, data_type, SUM(send_count(2, :)))
      ALLOCATE (send_meta(metalen*SUM(send_count(1, :))))
      DO dst_p = 0, numnodes - 1
         total_send_count(1, dst_p) = send_count(1, dst_p)
         total_send_count(2, dst_p) = send_count(2, dst_p)
         total_recv_count(1, dst_p) = recv_count(1, dst_p)
         total_recv_count(2, dst_p) = recv_count(2, dst_p)
      END DO
      ! Exclusive prefix sums (1-based) give the per-process starting offsets
      ! in the packed buffers.
      sd_disp = -1; sm_disp = -1; rd_disp = -1; rm_disp = -1
      sd_disp(0) = 1; sm_disp(0) = 1; rd_disp(0) = 1; rm_disp(0) = 1
      DO dst_p = 1, numnodes - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) + total_recv_count(2, dst_p - 1)
      END DO
      ! sdp is post-incremented; smp is pre-incremented, so rewind it first.
      sdp(:) = sd_disp
      smp(:) = sm_disp - metalen

      ! Alias the flat data area with a pointer of the matching type.
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         r_sp => dblk%d%r_sp
      CASE (dbcsr_type_real_8)
         r_dp => dblk%d%r_dp
      CASE (dbcsr_type_complex_4)
         c_sp => dblk%d%c_sp
      CASE (dbcsr_type_complex_8)
         c_dp => dblk%d%c_dp
      END SELECT

      CALL dbcsr_data_init(data_block)
      CALL dbcsr_data_new(data_block, data_type)
      ! Second pass: same traversal as the count pass; pack each block's data
      ! and (row, col) metadata into its destination's buffer region.
      i = 0
      dst_p = -1
      DO ind = 1, job_count*nblkrows_total
         brow = MOD(ind - 1, nblkrows_total) + 1
         IF (brow .EQ. 1) i = i + 1
         row_size = row_blk_size(brow)
         DO blk = row_p(ind + i - 1) + 1, row_p(ind + i)
            actual_blk = INT(blk, kind=int_8) + extra_nblks(i)
            bcol = col_i(actual_blk)
            col_size = col_blk_size(bcol)
            blk_size = row_size*col_size
            ! int_8 arithmetic: offsets into the flat data area can exceed
            ! the default-integer range.
            blkp = INT(blk_p(actual_blk), kind=int_8)
            start_ind = blkp + extra_darea_size(i)
            end_ind = blkp + extra_darea_size(i) + blk_size - 1

            ! Point data_block at the block's slice of the file data (no copy).
            SELECT CASE (data_type)
            CASE (dbcsr_type_real_4)
               data_block%d%r_sp => r_sp(start_ind:end_ind)
            CASE (dbcsr_type_real_8)
               data_block%d%r_dp => r_dp(start_ind:end_ind)
            CASE (dbcsr_type_complex_4)
               data_block%d%c_sp => c_sp(start_ind:end_ind)
            CASE (dbcsr_type_complex_8)
               data_block%d%c_dp => c_dp(start_ind:end_ind)
            END SELECT
            sym_tr = .FALSE.
            CALL dbcsr_get_stored_coordinates(new_matrix, brow, bcol, dst_p)
            smp(dst_p) = smp(dst_p) + metalen
            ! Blocks read from file are sent untransposed; a negative row in
            ! the metadata would mark a transposed block (path disabled).
            tr = .FALSE.
!           IF (tr) THEN
!              send_meta(smp(dst_p)) = -brow
!           ELSE
            send_meta(smp(dst_p)) = brow
!           ENDIF
            send_meta(smp(dst_p) + 1) = bcol
            blk_size = dbcsr_data_get_size(data_block)

            CALL dbcsr_data_set(send_data, lb=sdp(dst_p), &
                                data_size=blk_size, src=data_block, source_lb=1)
            sdp(dst_p) = sdp(dst_p) + blk_size
         END DO
      END DO
      CALL dbcsr_data_clear_pointer(data_block)

      ! Exchange the data and metadata; displacements are converted from
      ! 1-based to 0-based for the alltoall routines.
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL hybrid_alltoall_s1( &
            send_data%d%r_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_env)
      CASE (dbcsr_type_real_8)
         CALL hybrid_alltoall_d1( &
            send_data%d%r_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%r_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_env)
      CASE (dbcsr_type_complex_4)
         CALL hybrid_alltoall_c1( &
            send_data%d%c_sp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_sp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_env)
      CASE (dbcsr_type_complex_8)
         CALL hybrid_alltoall_z1( &
            send_data%d%c_dp(:), total_send_count(2, :), sd_disp(:) - 1, &
            recv_data%d%c_dp(:), total_recv_count(2, :), rd_disp(:) - 1, &
            mp_env)
      END SELECT
      CALL hybrid_alltoall_i1(send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
                              recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, mp_env)
      CALL dbcsr_work_create(new_matrix, SUM(recv_count(1, :)), &
                             SUM(recv_count(2, :)), work_mutable=.FALSE., n=1)

      ! Third pass: unpack the received blocks; blk_ps walks linearly
      ! through the received data in the order it was packed.
      blk_ps = 1
      blks = 0
      DO src_p = 0, numnodes - 1
         DO meta_l = 1, recv_count(1, src_p)
            brow = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1))
            tr = brow .LT. 0   ! negative row marks a transposed block
            stored_row_new = ABS(brow)
            stored_col_new = recv_meta(rm_disp(src_p) + metalen*(meta_l - 1) + 1)
            blk_size = row_blk_size(stored_row_new)*col_blk_size(stored_col_new)

            ! data_block aliases a slice of recv_data; no copy is made here.
            data_block = pointer_view(data_block, recv_data, blk_ps, blk_size)
            CALL dbcsr_put_block(new_matrix, stored_row_new, stored_col_new, data_block, transposed=tr)

            blk_ps = blk_ps + blk_size
            blks = blks + 1
         END DO
      END DO
      ! Detach the alias, then free only the envelope of data_block (the
      ! aliased data belongs to recv_data and is released below).
      CALL dbcsr_data_clear_pointer(data_block)
      DEALLOCATE (data_block%d)
      CALL dbcsr_finalize(new_matrix, reshuffle=.TRUE.)
      CALL dbcsr_data_release(recv_data)
      CALL dbcsr_data_release(send_data)

      DEALLOCATE (send_count)
      DEALLOCATE (recv_count)
      DEALLOCATE (sdp); DEALLOCATE (sd_disp)
      DEALLOCATE (smp); DEALLOCATE (sm_disp)
      DEALLOCATE (rd_disp)
      DEALLOCATE (rm_disp)
      DEALLOCATE (recv_meta)
      DEALLOCATE (send_meta)
      DEALLOCATE (extra_nblks); DEALLOCATE (extra_darea_size)

      CALL timestop(handle)
   CONTAINS

      !> Inclusive cumulative sum of arr into cumsum (int_8 kind).
      SUBROUTINE cumsum_l(arr, cumsum)
         INTEGER(kind=int_8), DIMENSION(:), INTENT(IN)      :: arr
         INTEGER(kind=int_8), DIMENSION(:), INTENT(OUT)     :: cumsum

         INTEGER                                            :: i

         IF (SIZE(cumsum) > 0) THEN
            cumsum(1) = arr(1)
            DO i = 2, SIZE(cumsum)
               cumsum(i) = cumsum(i - 1) + arr(i)
            END DO
         END IF
      END SUBROUTINE cumsum_l
   END SUBROUTINE dbcsr_datablock_redistribute

END MODULE dbcsr_transformations
